Pool Size Differential Expression Analysis

Author

Felix Pförtner

Published

April 6, 2025

Differential expression analysis comparing different pool sizes (080ng, 320ng, and 920ng).

Setup

library(tidyverse)
library(limma)
library(edgeR)
library(plyranges)
library(scales)
library(DT)
library(UpSetR)
library(clusterProfiler)
library(org.Mm.eg.db)
library(enrichplot)
library(ggplot2)
source("/data/share/htp/prime-seq_NextGen/scripts/poolsize_functions.R")

Data Loading

Read Count Data

counts_080 <- as.data.frame(as.matrix(readRDS("/data/share/htp/prime-seq_NextGen/data/FC2024_08_01_poolsize/03_zUMIs/80ng/zUMIs_output/expression/poolsize_80ng.dgecounts.rds")$umicount$inex$all)) %>% 
  rownames_to_column(var="gene_id") %>%
  as_tibble()

counts_320 <- as.data.frame(as.matrix(readRDS("/data/share/htp/prime-seq_NextGen/data/FC2024_08_01_poolsize/03_zUMIs/320ng/zUMIs_output/expression/poolsize_320ng.dgecounts.rds")$umicount$inex$all)) %>% 
  rownames_to_column(var="gene_id") %>%
  as_tibble()

counts_920 <- as.data.frame(as.matrix(readRDS("/data/share/htp/prime-seq_NextGen/data/FC2024_08_01_poolsize/03_zUMIs/920ng/zUMIs_output/expression/poolsize_920ng.dgecounts.rds")$umicount$inex$all)) %>% 
  rownames_to_column(var="gene_id") %>%
  as_tibble()

Load Annotation Data

genes <- read.delim("/data/share/htp/prime-seq_NextGen/data/FC2024_08_01_poolsize/03_zUMIs/80ng/zUMIs_output/expression/poolsize_80ng.gene_names.txt")

gtf <- read_gff("/data/share/htp/prime-seq_NextGen/data/FC2024_08_01_poolsize/03_zUMIs/80ng/poolsize_80ng.final_annot.gtf")

Combine Datasets

# Add pool size prefixes to sample names
colnames(counts_080)[-1] <- paste0("pool080_", colnames(counts_080)[-1])
colnames(counts_320)[-1] <- paste0("pool320_", colnames(counts_320)[-1])
colnames(counts_920)[-1] <- paste0("pool920_", colnames(counts_920)[-1])

# Find common genes across all datasets
counts <- full_join(full_join(counts_080, counts_320, by="gene_id"), counts_920, by="gene_id") %>%
  mutate(across(where(is.numeric), ~replace_na(., 0))) %>%
  left_join(genes, by="gene_id") %>%
  dplyr::select(gene_id, gene_name, everything())

# Create sample metadata
sample_info <- data.frame(
  sample = colnames(counts)[-(1:2)],
  pool_size = c(rep("pool080", ncol(counts_080)-1), 
                rep("pool320", ncol(counts_320)-1), 
                rep("pool920", ncol(counts_920)-1)),
  pool_size_numeric = c(rep(080, ncol(counts_080)-1), 
                        rep(320, ncol(counts_320)-1), 
                        rep(920, ncol(counts_920)-1))
)

# Convert pool_size to factor with correct ordering
sample_info$pool_size <- factor(sample_info$pool_size, levels = c("pool080", "pool320", "pool920"))

# Report dimensions of the datasets
print(paste("Combined dataset:", nrow(counts), "genes,", ncol(counts)-2, "samples"))
[1] "Combined dataset: 24311 genes, 40 samples"

Analysis with All Samples

This section analyzes all available samples with different filtering strategies.

Quality Control - All Samples

Library Size Distribution - All Samples

lib_sizes <- colSums(counts[,-c(1:2)])
lib_df <- data.frame(
  sample = names(lib_sizes),
  lib_size = lib_sizes,
  pool_size = sample_info$pool_size
)

p_lib <- ggplot(lib_df, aes(x = sample, y = lib_size, fill = pool_size)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Library sizes (UMI counts)", 
       x = "Sample", y = "UMI counts", fill = "Pool size") +
  scale_fill_brewer(type = "qual", palette = "Set1") +
  scale_y_continuous(labels = comma)

print(p_lib)

PCA Before Filtering - All Samples

Principal Component Analysis helps visualize sample relationships and identify potential batch effects.

count_matrix_raw <- as.matrix(counts[,-c(1:2)])
rownames(count_matrix_raw) <- counts$gene_id
logcpm_raw <- cpm(count_matrix_raw, log=TRUE, prior.count=1)

# Remove genes with low variance for PCA
var_genes <- apply(logcpm_raw, 1, var)
high_var_genes <- order(var_genes, decreasing=TRUE)[1:1000]
pca_raw <- prcomp(t(logcpm_raw[high_var_genes,]), scale=TRUE)

pca_df_raw <- data.frame(
  PC1 = pca_raw$x[,1],
  PC2 = pca_raw$x[,2],
  sample = rownames(pca_raw$x),
  pool_size = sample_info$pool_size,
  lib_size = lib_sizes[rownames(pca_raw$x)]
)

pc1_var <- round(summary(pca_raw)$importance[2,1]*100, 1)
pc2_var <- round(summary(pca_raw)$importance[2,2]*100, 1)

# PCA colored by pool size
ggplot(pca_df_raw, aes(x = PC1, y = PC2, color = pool_size)) +
  geom_point(size = 3) +
  theme_minimal() +
  labs(title = "PCA before filtering (colored by pool size)",
       x = paste0("PC1 (", pc1_var, "%)"),
       y = paste0("PC2 (", pc2_var, "%)"),
       color = "Pool size") +
  scale_color_brewer(type = "qual", palette = "Set1")

# PCA colored by library size and shaped by pool size
ggplot(pca_df_raw, aes(x = PC1, y = PC2, color = lib_size, shape = pool_size)) +
  geom_point(size = 4) +
  theme_minimal() +
  labs(title = "PCA before filtering (colored by library size)",
       x = paste0("PC1 (", pc1_var, "%)"),
       y = paste0("PC2 (", pc2_var, "%)"),
       color = "Library size",
       shape = "Pool size") +
  scale_color_gradient(low = "blue", high = "red", labels = comma) +
  scale_shape_manual(values = c(16, 17, 18))

Filtering - All Samples

We apply multiple filtering steps to remove uninformative genes:

  1. Sparse genes: Remove genes with low expression (CPM > 3 in >= 5 samples)
  2. optional Highly expressed genes: Remove top 10 most highly expressed genes (likely housekeeping genes)
  3. optional Mitochondrial genes: Remove mitochondrial genes
  4. optional Ribosomal RNA genes: Remove rRNA and rRNA pseudogenes
# Filter sparse genes CPM>3 in >= 5 samples
count_matrix <- as.matrix(counts[,-c(1:2)])
rownames(count_matrix) <- counts$gene_id
cpm_values <- cpm(count_matrix)
count_matrix_zero <- count_matrix
count_matrix_zero[cpm_values < 3] <- 0
keep_sparse <- rowSums(count_matrix_zero[,-c(1:2)] > 1) >= 5

print(paste("Removed sparse genes:", sum(!keep_sparse)))
[1] "Removed sparse genes: 11442"
# Filter most highly expressed genes (remove top 10)
sum_expression <- rowSums(count_matrix)
gene_expression <- tibble(
  gene_id = counts$gene_id,
  gene_name = counts$gene_name,
  total_expression = sum_expression,
  perc = total_expression/(sum(count_matrix))*100
)

top_genes <- gene_expression %>%
  arrange(desc(total_expression)) %>%
  head(10)

knitr::kable(top_genes, caption = "Top 10 genes by total expression")
Top 10 genes by total expression
gene_id gene_name total_expression perc
ENSMUSG00000064339.1 mt-Rnr2 156926 3.3745263
ENSMUSG00000101249.2 Gm29216 112241 2.4136230
ENSMUSG00000064337.1 mt-Rnr1 111224 2.3917535
ENSMUSG00000119584.1 Rn18s-rs5 57056 1.2269284
ENSMUSG00000100862.2 Gm10925 52956 1.1387623
ENSMUSG00000064370.1 mt-Cytb 48465 1.0421881
ENSMUSG00000101111.2 Gm28437 43057 0.9258949
ENSMUSG00000064363.1 mt-Nd4 40849 0.8784142
ENSMUSG00000102070.2 Gm28661 39144 0.8417500
ENSMUSG00000064341.1 mt-Nd1 34118 0.7336712
# Filter ribo and mito genes
all_genes <- gtf %>% filter(type=="gene") %>% as_tibble
mitochondrial_genes <- all_genes %>%
  filter(seqnames == "chrM") %>%
  pull(gene_id)
rrna_genes <- all_genes %>%
  filter(gene_type %in% c("rRNA","rRNA_pseudogene")) %>%
  pull(gene_id)

# Apply all filters
counts_filt <- counts[keep_sparse,] %>%
  filter(!(gene_id %in% c(top_genes$gene_id,
                          mitochondrial_genes,
                          rrna_genes)))

counts_filt_alt <- counts[keep_sparse,]

print(paste("Filtered dataset:", nrow(counts_filt), "genes,", ncol(counts_filt)-2, "samples,", round(sum(counts_filt[,-c(1,2)])/1000000, 2), "M UMIs"))
[1] "Filtered dataset: 12840 genes, 40 samples, 3.85 M UMIs"
print(paste("Filtered dataset:", nrow(counts_filt_alt), "genes,", ncol(counts_filt_alt)-2, "samples,", round(sum(counts_filt_alt[,-c(1,2)])/1000000, 2), "M UMIs"))
[1] "Filtered dataset: 12869 genes, 40 samples, 4.59 M UMIs"

PCA After Filtering - All Samples

stringent

count_matrix_filt <- as.matrix(counts_filt[,-c(1:2)])
rownames(count_matrix_filt) <- counts_filt$gene_id
logcpm_filt <- cpm(count_matrix_filt, log=TRUE, prior.count=1)

# Remove genes with low variance for PCA
var_genes_filt <- apply(logcpm_filt, 1, var)
high_var_genes_filt <- order(var_genes_filt, decreasing=TRUE)[1:min(1000, nrow(logcpm_filt))]
pca_filt <- prcomp(t(logcpm_filt[high_var_genes_filt,]), scale=TRUE)

# Calculate library sizes for filtered data
lib_sizes_filt <- colSums(count_matrix_filt)

pca_df_filt <- data.frame(
  PC1 = pca_filt$x[,1],
  PC2 = pca_filt$x[,2],
  sample = rownames(pca_filt$x),
  pool_size = sample_info$pool_size,
  lib_size = lib_sizes_filt[rownames(pca_filt$x)]
)

pc1_var_filt <- round(summary(pca_filt)$importance[2,1]*100, 1)
pc2_var_filt <- round(summary(pca_filt)$importance[2,2]*100, 1)

# PCA colored by library size and shaped by pool size
ggplot(pca_df_filt, aes(x = PC1, y = PC2, color = lib_size, shape = pool_size)) +
  geom_point(size = 4) +
  theme_minimal() +
  labs(title = "PCA after filtering (colored by library size)",
       x = paste0("PC1 (", pc1_var_filt, "%)"),
       y = paste0("PC2 (", pc2_var_filt, "%)"),
       color = "Library size",
       shape = "Pool size") +
  scale_color_gradient(low = "blue", high = "red", labels = comma) +
  scale_shape_manual(values = c(16, 17, 18))

lenient

count_matrix_filt <- as.matrix(counts_filt_alt[,-c(1:2)])
rownames(count_matrix_filt) <- counts_filt_alt$gene_id
logcpm_filt <- cpm(count_matrix_filt, log=TRUE, prior.count=1)

# Remove genes with low variance for PCA
var_genes_filt <- apply(logcpm_filt, 1, var)
high_var_genes_filt <- order(var_genes_filt, decreasing=TRUE)[1:min(1000, nrow(logcpm_filt))]
pca_filt <- prcomp(t(logcpm_filt[high_var_genes_filt,]), scale=TRUE)

# Calculate library sizes for filtered data
lib_sizes_filt <- colSums(count_matrix_filt)

pca_df_filt <- data.frame(
  PC1 = pca_filt$x[,1],
  PC2 = pca_filt$x[,2],
  sample = rownames(pca_filt$x),
  pool_size = sample_info$pool_size,
  lib_size = lib_sizes_filt[rownames(pca_filt$x)]
)

pc1_var_filt <- round(summary(pca_filt)$importance[2,1]*100, 1)
pc2_var_filt <- round(summary(pca_filt)$importance[2,2]*100, 1)

# PCA colored by library size and shaped by pool size
ggplot(pca_df_filt, aes(x = PC1, y = PC2, color = lib_size, shape = pool_size)) +
  geom_point(size = 4) +
  theme_minimal() +
  labs(title = "PCA after filtering (colored by library size)",
       x = paste0("PC1 (", pc1_var_filt, "%)"),
       y = paste0("PC2 (", pc2_var_filt, "%)"),
       color = "Library size",
       shape = "Pool size") +
  scale_color_gradient(low = "blue", high = "red", labels = comma) +
  scale_shape_manual(values = c(16, 17, 18))

DEG

stringent

Model Diagnostics - All Samples

Before performing differential expression analysis.

# Convert to DGEList and calculate normalization factors
dge <- DGEList(counts = counts_filt %>% column_to_rownames(var="gene_id"), samples = sample_info)
Setting first column of `counts` as gene annotation.
dge <- calcNormFactors(dge)

# Estimate dispersion and create dispersion plot
dge <- estimateDisp(dge, design=model.matrix(~ pool_size, data = sample_info))

Statistical Analysis - All Samples

We use limma-trend for differential expression analysis, which is appropriate for RNA-seq data.

# Create design matrix
design <- model.matrix(~ 0 + pool_size, data = sample_info)
colnames(design) <- levels(sample_info$pool_size)

# Estimate mean-variance trend and fit linear model
logCPM <- cpm(dge, log = TRUE, prior.count = 1)
fit <- lmFit(logCPM, design)
fit <- eBayes(fit, trend = TRUE)

# Define contrasts for pairwise comparisons
contrasts <- makeContrasts(
  pool320_vs_pool080 = pool320 - pool080,
  pool920_vs_pool080 = pool920 - pool080,
  pool920_vs_pool320 = pool920 - pool320,
  levels = design
)

# Fit contrasts
fit_contrasts <- contrasts.fit(fit, contrasts)
fit_contrasts <- eBayes(fit_contrasts, trend = TRUE)

# Extract results for each comparison
results_320_vs_080 <- topTable(fit_contrasts, coef = "pool320_vs_pool080", 
                               number = Inf, sort.by = "P")
results_920_vs_080 <- topTable(fit_contrasts, coef = "pool920_vs_pool080", 
                               number = Inf, sort.by = "P")
results_920_vs_320 <- topTable(fit_contrasts, coef = "pool920_vs_pool320", 
                               number = Inf, sort.by = "P")

Results Summary - Stringent Filtering

# Summary for 320ng vs 080ng
sig_320_vs_080 <- results_320_vs_080[results_320_vs_080$adj.P.Val < 0.05,]
up_320_vs_080 <- sum(sig_320_vs_080$logFC > 0)
down_320_vs_080 <- sum(sig_320_vs_080$logFC < 0)
print(paste("320ng vs 080ng:", nrow(sig_320_vs_080), "DE genes"))
[1] "320ng vs 080ng: 3 DE genes"
print(paste("  Up-regulated:", up_320_vs_080))
[1] "  Up-regulated: 3"
print(paste("  Down-regulated:", down_320_vs_080))
[1] "  Down-regulated: 0"
# Summary for 920ng vs 080ng
sig_920_vs_080_summary <- results_920_vs_080[results_920_vs_080$adj.P.Val < 0.05,]
up_920_vs_080 <- sum(sig_920_vs_080_summary$logFC > 0)
down_920_vs_080 <- sum(sig_920_vs_080_summary$logFC < 0)
print(paste("920ng vs 080ng:", nrow(sig_920_vs_080_summary), "DE genes"))
[1] "920ng vs 080ng: 27 DE genes"
print(paste("  Up-regulated:", up_920_vs_080))
[1] "  Up-regulated: 17"
print(paste("  Down-regulated:", down_920_vs_080))
[1] "  Down-regulated: 10"
# Summary for 920ng vs 320ng
sig_920_vs_320 <- results_920_vs_320[results_920_vs_320$adj.P.Val < 0.05,]
up_920_vs_320 <- sum(sig_920_vs_320$logFC > 0)
down_920_vs_320 <- sum(sig_920_vs_320$logFC < 0)
print(paste("920ng vs 320ng:", nrow(sig_920_vs_320), "DE genes"))
[1] "920ng vs 320ng: 15 DE genes"
print(paste("  Up-regulated:", up_920_vs_320))
[1] "  Up-regulated: 10"
print(paste("  Down-regulated:", down_920_vs_320))
[1] "  Down-regulated: 5"

lenient

Model Diagnostics - All Samples

Before performing differential expression analysis.

# Convert to DGEList and calculate normalization factors
dge <- DGEList(counts = counts_filt_alt %>% column_to_rownames(var="gene_id"), samples = sample_info)
Setting first column of `counts` as gene annotation.
dge <- calcNormFactors(dge)

# Estimate dispersion and create dispersion plot
dge <- estimateDisp(dge, design=model.matrix(~ pool_size, data = sample_info))

Statistical Analysis - All Samples

We use limma-trend for differential expression analysis, which is appropriate for RNA-seq data.

# Create design matrix
design <- model.matrix(~ 0 + pool_size, data = sample_info)
colnames(design) <- levels(sample_info$pool_size)

# Estimate mean-variance trend and fit linear model
logCPM <- cpm(dge, log = TRUE, prior.count = 1)
fit <- lmFit(logCPM, design)
fit <- eBayes(fit, trend = TRUE)

# Define contrasts for pairwise comparisons
contrasts <- makeContrasts(
  pool320_vs_pool080 = pool320 - pool080,
  pool920_vs_pool080 = pool920 - pool080,
  pool920_vs_pool320 = pool920 - pool320,
  levels = design
)

# Fit contrasts
fit_contrasts <- contrasts.fit(fit, contrasts)
fit_contrasts <- eBayes(fit_contrasts, trend = TRUE)

# Extract results for each comparison with lenient suffix
results_320_vs_080_lenient <- topTable(fit_contrasts, coef = "pool320_vs_pool080", 
                                       number = Inf, sort.by = "P")
results_920_vs_080_lenient <- topTable(fit_contrasts, coef = "pool920_vs_pool080", 
                                       number = Inf, sort.by = "P")
results_920_vs_320_lenient <- topTable(fit_contrasts, coef = "pool920_vs_pool320", 
                                       number = Inf, sort.by = "P")

Results Summary - Lenient Filtering

# Summary for 320ng vs 080ng
sig_320_vs_080 <- results_320_vs_080_lenient[results_320_vs_080_lenient$adj.P.Val < 0.05,]
up_320_vs_080 <- sum(sig_320_vs_080$logFC > 0)
down_320_vs_080 <- sum(sig_320_vs_080$logFC < 0)
print(paste("320ng vs 080ng:", nrow(sig_320_vs_080), "DE genes"))
[1] "320ng vs 080ng: 4 DE genes"
print(paste("  Up-regulated:", up_320_vs_080))
[1] "  Up-regulated: 4"
print(paste("  Down-regulated:", down_320_vs_080))
[1] "  Down-regulated: 0"
# Summary for 920ng vs 080ng
sig_920_vs_080_summary <- results_920_vs_080_lenient[results_920_vs_080_lenient$adj.P.Val < 0.05,]
up_920_vs_080 <- sum(sig_920_vs_080_summary$logFC > 0)
down_920_vs_080 <- sum(sig_920_vs_080_summary$logFC < 0)
print(paste("920ng vs 080ng:", nrow(sig_920_vs_080_summary), "DE genes"))
[1] "920ng vs 080ng: 37 DE genes"
print(paste("  Up-regulated:", up_920_vs_080))
[1] "  Up-regulated: 29"
print(paste("  Down-regulated:", down_920_vs_080))
[1] "  Down-regulated: 8"
# Summary for 920ng vs 320ng
sig_920_vs_320 <- results_920_vs_320_lenient[results_920_vs_320_lenient$adj.P.Val < 0.05,]
up_920_vs_320 <- sum(sig_920_vs_320$logFC > 0)
down_920_vs_320 <- sum(sig_920_vs_320$logFC < 0)
print(paste("920ng vs 320ng:", nrow(sig_920_vs_320), "DE genes"))
[1] "920ng vs 320ng: 29 DE genes"
print(paste("  Up-regulated:", up_920_vs_320))
[1] "  Up-regulated: 23"
print(paste("  Down-regulated:", down_920_vs_320))
[1] "  Down-regulated: 6"

Analysis with Top 8 Samples

This section analyzes only the top 8 samples per condition (highest UMI counts) with different filtering strategies to address potential library size confounding effects observed in PCA.

Sample Selection Strategy

Select the top 8 samples with highest library sizes from each pool size condition.

# Calculate library sizes for all samples
lib_sizes_all <- colSums(counts[,-c(1:2)])
lib_df_all <- data.frame(
  sample = names(lib_sizes_all),
  lib_size = lib_sizes_all,
  pool_size = sample_info$pool_size[match(names(lib_sizes_all), sample_info$sample)]
)

# Select top 8 samples per condition
top8_samples <- lib_df_all %>%
  group_by(pool_size) %>%
  top_n(8, lib_size) %>%
  arrange(pool_size, desc(lib_size)) %>%
  ungroup()

print("Top 8 samples per condition by library size:")
[1] "Top 8 samples per condition by library size:"
knitr::kable(top8_samples, caption = "Top 8 samples per condition selected for analysis")
Top 8 samples per condition selected for analysis
sample lib_size pool_size
pool080_GTACAAGACGTG 196486 pool080
pool080_TGGTGGCGCTTA 121620 pool080
pool080_TATTCCTGTAGG 120800 pool080
pool080_CTGGCGTTGCTA 117406 pool080
pool080_TGGCGTAACCTC 114864 pool080
pool080_ATAGAGTACTCC 106897 pool080
pool080_TGCTACCAACAA 106764 pool080
pool080_CAGACAGCATAA 77162 pool080
pool320_CTGGCGTTGCTA 155582 pool320
pool320_TATTCCTGTAGG 131051 pool320
pool320_TGGCGTAACCTC 122014 pool320
pool320_CAGACAGCATAA 121307 pool320
pool320_TGGTGGCGCTTA 118451 pool320
pool320_GTACAAGACGTG 114927 pool320
pool320_ATAGAGTACTCC 100200 pool320
pool320_ATGGTCCACTCG 94998 pool320
pool920_CTGGCGTTGCTA 339521 pool920
pool920_TATTCCTGTAGG 313842 pool920
pool920_GTACAAGACGTG 284091 pool920
pool920_TGCTACCAACAA 282487 pool920
pool920_CAGACAGCATAA 280298 pool920
pool920_ATAGAGTACTCC 261924 pool920
pool920_TGGTGGCGCTTA 249454 pool920
pool920_TGGCGTAACCTC 207290 pool920
# Filter counts for top 8 samples (use original unfiltered counts)
selected_samples <- top8_samples$sample
counts_top8_unfiltered <- counts %>%
  dplyr::select(gene_id, gene_name, all_of(selected_samples))

sample_info_top8 <- sample_info %>%
  filter(sample %in% selected_samples) %>%
  mutate(pool_size = factor(pool_size, levels = c("pool080", "pool320", "pool920")))

print(paste("Top 8 samples dataset:", nrow(counts_top8_unfiltered), "genes,", ncol(counts_top8_unfiltered)-2, "samples"))
[1] "Top 8 samples dataset: 24311 genes, 24 samples"
print(paste("Sample distribution: pool080 =", sum(sample_info_top8$pool_size == "pool080"),
            ", pool320 =", sum(sample_info_top8$pool_size == "pool320"),
            ", pool920 =", sum(sample_info_top8$pool_size == "pool920")))
[1] "Sample distribution: pool080 = 8 , pool320 = 8 , pool920 = 8"
## Downsample to match library sizes

# Calculate target library size (min of pool080 and pool320 top 8 samples)
pool080_lib_sizes <- colSums(counts_top8_unfiltered[, sample_info_top8$sample[sample_info_top8$pool_size == "pool080"]])
pool320_lib_sizes <- colSums(counts_top8_unfiltered[, sample_info_top8$sample[sample_info_top8$pool_size == "pool320"]])
target_lib_size <- min(c(pool080_lib_sizes, pool320_lib_sizes))

print(paste("Target library size for downsampling:", round(target_lib_size, 0)))
[1] "Target library size for downsampling: 77162"
print(paste("Pool080 min library size:", round(min(pool080_lib_sizes), 0)))
[1] "Pool080 min library size: 77162"
print(paste("Pool320 min library size:", round(min(pool320_lib_sizes), 0)))
[1] "Pool320 min library size: 94998"
print(paste("Pool920 min library size:", round(min(pool320_lib_sizes), 0)))
[1] "Pool920 min library size: 94998"
# Get pool920 samples and their current library sizes
lib_sizes <- colSums(counts_top8_unfiltered[,-c(1,2)])

print("library sizes before downsampling:")
[1] "library sizes before downsampling:"
print(round(lib_sizes, 0))
pool080_GTACAAGACGTG pool080_TGGTGGCGCTTA pool080_TATTCCTGTAGG 
              196486               121620               120800 
pool080_CTGGCGTTGCTA pool080_TGGCGTAACCTC pool080_ATAGAGTACTCC 
              117406               114864               106897 
pool080_TGCTACCAACAA pool080_CAGACAGCATAA pool320_CTGGCGTTGCTA 
              106764                77162               155582 
pool320_TATTCCTGTAGG pool320_TGGCGTAACCTC pool320_CAGACAGCATAA 
              131051               122014               121307 
pool320_TGGTGGCGCTTA pool320_GTACAAGACGTG pool320_ATAGAGTACTCC 
              118451               114927               100200 
pool320_ATGGTCCACTCG pool920_CTGGCGTTGCTA pool920_TATTCCTGTAGG 
               94998               339521               313842 
pool920_GTACAAGACGTG pool920_TGCTACCAACAA pool920_CAGACAGCATAA 
              284091               282487               280298 
pool920_ATAGAGTACTCC pool920_TGGTGGCGCTTA pool920_TGGCGTAACCTC 
              261924               249454               207290 
# Downsample
set.seed(123) # For reproducibility
counts_top8_downsampled <- counts_top8_unfiltered

for(sample_name in sample_info_top8$sample) {
  current_lib_size <- lib_sizes[sample_name]
  
  if(current_lib_size > target_lib_size) {
    # Calculate downsampling probability
    downsample_prob <- target_lib_size / current_lib_size
    
    # Get count vector for this sample (excluding gene_id and gene_name columns)
    sample_counts <- counts_top8_unfiltered[[sample_name]]
    
    # Downsample using binomial sampling
    downsampled_counts <- rbinom(n = length(sample_counts), 
                                 size = sample_counts, 
                                 prob = downsample_prob)
    
    # Update the counts matrix
    counts_top8_downsampled[[sample_name]] <- downsampled_counts
    
    cat("Sample", sample_name, ": downsampled from", current_lib_size, "to", sum(downsampled_counts), "UMIs\n")
  } else {
    cat("Sample", sample_name, ": no downsampling needed (", current_lib_size, "UMIs)\n")
  }
}
Sample pool080_ATAGAGTACTCC : downsampled from 106897 to 77342 UMIs
Sample pool080_CAGACAGCATAA : no downsampling needed ( 77162 UMIs)
Sample pool080_CTGGCGTTGCTA : downsampled from 117406 to 77403 UMIs
Sample pool080_GTACAAGACGTG : downsampled from 196486 to 76849 UMIs
Sample pool080_TATTCCTGTAGG : downsampled from 120800 to 77158 UMIs
Sample pool080_TGCTACCAACAA : downsampled from 106764 to 77104 UMIs
Sample pool080_TGGCGTAACCTC : downsampled from 114864 to 77317 UMIs
Sample pool080_TGGTGGCGCTTA : downsampled from 121620 to 77060 UMIs
Sample pool320_ATAGAGTACTCC : downsampled from 100200 to 77158 UMIs
Sample pool320_ATGGTCCACTCG : downsampled from 94998 to 77137 UMIs
Sample pool320_CAGACAGCATAA : downsampled from 121307 to 77229 UMIs
Sample pool320_CTGGCGTTGCTA : downsampled from 155582 to 76992 UMIs
Sample pool320_GTACAAGACGTG : downsampled from 114927 to 77181 UMIs
Sample pool320_TATTCCTGTAGG : downsampled from 131051 to 77466 UMIs
Sample pool320_TGGCGTAACCTC : downsampled from 122014 to 77176 UMIs
Sample pool320_TGGTGGCGCTTA : downsampled from 118451 to 77272 UMIs
Sample pool920_ATAGAGTACTCC : downsampled from 261924 to 77166 UMIs
Sample pool920_CAGACAGCATAA : downsampled from 280298 to 77486 UMIs
Sample pool920_CTGGCGTTGCTA : downsampled from 339521 to 77235 UMIs
Sample pool920_GTACAAGACGTG : downsampled from 284091 to 77546 UMIs
Sample pool920_TATTCCTGTAGG : downsampled from 313842 to 76926 UMIs
Sample pool920_TGCTACCAACAA : downsampled from 282487 to 77385 UMIs
Sample pool920_TGGCGTAACCTC : downsampled from 207290 to 76876 UMIs
Sample pool920_TGGTGGCGCTTA : downsampled from 249454 to 76825 UMIs
# Verify downsampling results
lib_sizes_after <- colSums(counts_top8_downsampled[,-c(1,2)])
print("\nLibrary sizes after downsampling:")
[1] "\nLibrary sizes after downsampling:"
print("Pool080:")
[1] "Pool080:"
print(round(colSums(counts_top8_downsampled[, sample_info_top8$sample[sample_info_top8$pool_size == "pool080"]]), 0))
pool080_ATAGAGTACTCC pool080_CAGACAGCATAA pool080_CTGGCGTTGCTA 
               77342                77162                77403 
pool080_GTACAAGACGTG pool080_TATTCCTGTAGG pool080_TGCTACCAACAA 
               76849                77158                77104 
pool080_TGGCGTAACCTC pool080_TGGTGGCGCTTA 
               77317                77060 
print("Pool320:")
[1] "Pool320:"
print(round(colSums(counts_top8_downsampled[, sample_info_top8$sample[sample_info_top8$pool_size == "pool320"]]), 0))
pool320_ATAGAGTACTCC pool320_ATGGTCCACTCG pool320_CAGACAGCATAA 
               77158                77137                77229 
pool320_CTGGCGTTGCTA pool320_GTACAAGACGTG pool320_TATTCCTGTAGG 
               76992                77181                77466 
pool320_TGGCGTAACCTC pool320_TGGTGGCGCTTA 
               77176                77272 
print("Pool920:")
[1] "Pool920:"
print(round(colSums(counts_top8_downsampled[, sample_info_top8$sample[sample_info_top8$pool_size == "pool920"]]), 0))
pool920_ATAGAGTACTCC pool920_CAGACAGCATAA pool920_CTGGCGTTGCTA 
               77166                77486                77235 
pool920_GTACAAGACGTG pool920_TATTCCTGTAGG pool920_TGCTACCAACAA 
               77546                76926                77385 
pool920_TGGCGTAACCTC pool920_TGGTGGCGCTTA 
               76876                76825 
# Use downsampled counts for further analysis
counts_top8_unfiltered <- counts_top8_downsampled

Quality Control - Top 8 Samples

Library Size Distribution - Top 8 Samples

lib_sizes_top8_all <- colSums(counts_top8_unfiltered[,-c(1:2)])
lib_df_top8_all <- data.frame(
  sample = names(lib_sizes_top8_all),
  lib_size = lib_sizes_top8_all,
  pool_size = sample_info_top8$pool_size
)

ggplot(lib_df_top8_all, aes(x = sample, y = lib_size, fill = pool_size)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Library sizes - Top 8 samples per condition (after downsampling)", 
       x = "Sample", y = "UMI counts", fill = "Pool size") +
  scale_fill_brewer(type = "qual", palette = "Set1") +
  scale_y_continuous(labels = comma)

# Summary statistics
cat("Library size summary statistics after downsampling:\n")
Library size summary statistics after downsampling:
lib_summary <- lib_df_top8_all %>%
  group_by(pool_size) %>%
  summarise(
    median = median(lib_size),
    mean = mean(lib_size),
    min = min(lib_size),
    max = max(lib_size),
    .groups = 'drop'
  )
knitr::kable(lib_summary, caption = "Library size statistics by pool size (after downsampling)")
Library size statistics by pool size (after downsampling)
pool_size median mean min max
pool080 77160.0 77174.38 76849 77403
pool320 77178.5 77201.38 76992 77466
pool920 77200.5 77180.62 76825 77546

PCA Before Filtering - Top 8 Samples

count_matrix_top8_raw <- as.matrix(counts_top8_unfiltered[,-c(1:2)])
rownames(count_matrix_top8_raw) <- counts_top8_unfiltered$gene_id
logcpm_top8_raw <- cpm(count_matrix_top8_raw, log=TRUE, prior.count=1)

# Remove genes with low variance for PCA
var_genes_top8_raw <- apply(logcpm_top8_raw, 1, var)
high_var_genes_top8_raw <- order(var_genes_top8_raw, decreasing=TRUE)[1:min(1000, nrow(logcpm_top8_raw))]
pca_top8_raw <- prcomp(t(logcpm_top8_raw[high_var_genes_top8_raw,]), scale=TRUE)

pca_df_top8_raw <- data.frame(
  PC1 = pca_top8_raw$x[,1],
  PC2 = pca_top8_raw$x[,2],
  sample = rownames(pca_top8_raw$x),
  pool_size = sample_info_top8$pool_size,
  lib_size = lib_sizes_top8_all[rownames(pca_top8_raw$x)]
)

pc1_var_top8_raw <- round(summary(pca_top8_raw)$importance[2,1]*100, 1)
pc2_var_top8_raw <- round(summary(pca_top8_raw)$importance[2,2]*100, 1)

# PCA colored by pool size
ggplot(pca_df_top8_raw, aes(x = PC1, y = PC2, color = pool_size)) +
  geom_point(size = 3) +
  theme_minimal() +
  labs(title = "PCA before filtering - Top 8 samples (colored by pool size)\n downsampled to match library sizes",
       x = paste0("PC1 (", pc1_var_top8_raw, "%)"),
       y = paste0("PC2 (", pc2_var_top8_raw, "%)"),
       color = "Pool size") +
  scale_color_brewer(type = "qual", palette = "Set1")

# PCA colored by library size
ggplot(pca_df_top8_raw, aes(x = PC1, y = PC2, color = lib_size, shape = pool_size)) +
  geom_point(size = 4) +
  theme_minimal() +
  labs(title = "PCA before filtering - Top 8 samples (colored by library size)\n downsampled to match library sizes",
       x = paste0("PC1 (", pc1_var_top8_raw, "%)"),
       y = paste0("PC2 (", pc2_var_top8_raw, "%)"),
       color = "Library size",
       shape = "Pool size") +
  scale_color_gradient(low = "blue", high = "red", labels = comma) +
  scale_shape_manual(values = c(16, 17, 18))

Filtering - Top 8 Samples

We apply multiple filtering steps to remove uninformative genes:

  1. Sparse genes: Remove genes with low expression (CPM > 3 in >= 5 samples)
  2. optional Highly expressed genes: Remove top 10 most highly expressed genes (likely housekeeping genes)
  3. optional Mitochondrial genes: Remove mitochondrial genes
  4. optional Ribosomal RNA genes: Remove rRNA and rRNA pseudogenes
# Step 1: Filter sparse genes CPM>3 in >= 5 samples (for 24 samples)
count_matrix_top8_unfiltered <- as.matrix(counts_top8_unfiltered[,-c(1:2)])
rownames(count_matrix_top8_unfiltered) <- counts_top8_unfiltered$gene_id
cpm_values_top8 <- cpm(count_matrix_top8_unfiltered)
count_matrix_top8_zero <- count_matrix_top8_unfiltered
count_matrix_top8_zero[cpm_values_top8 < 3] <- 0
keep_sparse_top8 <- rowSums(count_matrix_top8_zero > 1) >= 5

print(paste("Genes before sparse filtering (top 8 samples):", nrow(counts_top8_unfiltered)))
[1] "Genes before sparse filtering (top 8 samples): 24311"
print(paste("Removed sparse genes (top 8 samples):", sum(!keep_sparse_top8)))
[1] "Removed sparse genes (top 8 samples): 13960"
# Step 2: Filter most highly expressed genes (remove top 10)
sum_expression_top8 <- rowSums(count_matrix_top8_unfiltered)
gene_expression_top8 <- tibble(
  gene_id = counts_top8_unfiltered$gene_id,
  gene_name = counts_top8_unfiltered$gene_name,
  total_expression = sum_expression_top8,
  perc = total_expression/(sum(count_matrix_top8_unfiltered))*100
)

top_genes_top8 <- gene_expression_top8 %>%
  arrange(desc(total_expression)) %>%
  head(10)

print("Top 10 genes by total expression (top 8 samples):")
[1] "Top 10 genes by total expression (top 8 samples):"
knitr::kable(top_genes_top8, caption = "Top 10 genes by total expression - Top 8 samples")
Top 10 genes by total expression - Top 8 samples
gene_id gene_name total_expression perc
ENSMUSG00000064339.1 mt-Rnr2 58311 3.1477756
ENSMUSG00000101249.2 Gm29216 44755 2.4159883
ENSMUSG00000064337.1 mt-Rnr1 42655 2.3026250
ENSMUSG00000119584.1 Rn18s-rs5 23552 1.2713967
ENSMUSG00000100862.2 Gm10925 19462 1.0506081
ENSMUSG00000064370.1 mt-Cytb 18945 1.0226991
ENSMUSG00000101111.2 Gm28437 16042 0.8659878
ENSMUSG00000064363.1 mt-Nd4 15631 0.8438010
ENSMUSG00000102070.2 Gm28661 14834 0.8007769
ENSMUSG00000064341.1 mt-Nd1 13212 0.7132172
# Step 3: Filter ribo and mito genes (same as before)
mitochondrial_genes <- all_genes %>%
  filter(seqnames == "chrM") %>%
  pull(gene_id)
rrna_genes <- all_genes %>%
  filter(gene_type %in% c("rRNA","rRNA_pseudogene")) %>%
  pull(gene_id)

# Apply all filters
counts_top8_stringent <- counts_top8_unfiltered[keep_sparse_top8,] %>%
  filter(!(gene_id %in% c(top_genes_top8$gene_id,
                          mitochondrial_genes,
                          rrna_genes)))

counts_top8_lenient <- counts_top8_unfiltered[keep_sparse_top8,]

print(paste("Original dataset (top 8 samples):", nrow(counts_top8_unfiltered), "genes"))
[1] "Original dataset (top 8 samples): 24311 genes"
print(paste("Stringent filtered dataset (top 8 samples):", nrow(counts_top8_stringent), "genes"))
[1] "Stringent filtered dataset (top 8 samples): 10330 genes"
print(paste("Stringent filtered dataset (top 8 samples):", nrow(counts_top8_lenient), "genes"))
[1] "Stringent filtered dataset (top 8 samples): 10351 genes"

PCA after Filtering - Top 8 Samples

stringent

count_matrix_top8 <- as.matrix(counts_top8_stringent[,-c(1:2)])
rownames(count_matrix_top8) <- counts_top8_stringent$gene_id
logcpm_top8 <- cpm(count_matrix_top8, log=TRUE, prior.count=1)

# Remove genes with low variance for PCA
var_genes_top8 <- apply(logcpm_top8, 1, var)
high_var_genes_top8 <- order(var_genes_top8, decreasing=TRUE)[1:min(1000, nrow(logcpm_top8))]
pca_top8 <- prcomp(t(logcpm_top8[high_var_genes_top8,]), scale=TRUE)

pca_df_top8 <- data.frame(
  PC1 = pca_top8$x[,1],
  PC2 = pca_top8$x[,2],
  sample = rownames(pca_top8$x),
  pool_size = sample_info_top8$pool_size,
  lib_size = lib_sizes_top8_all[rownames(pca_top8$x)]
)

pc1_var_top8 <- round(summary(pca_top8)$importance[2,1]*100, 1)
pc2_var_top8 <- round(summary(pca_top8)$importance[2,2]*100, 1)

# PCA colored by pool size
ggplot(pca_df_top8, aes(x = PC1, y = PC2, color = pool_size)) +
  geom_point(size = 3) +
  theme_minimal() +
  labs(title = "PCA with top 8 samples per condition (colored by pool size)\n downsampled to match library sizes",
       x = paste0("PC1 (", pc1_var_top8, "%)"),
       y = paste0("PC2 (", pc2_var_top8, "%)"),
       color = "Pool size") +
  scale_color_brewer(type = "qual", palette = "Set1")

# PCA colored by library size
ggplot(pca_df_top8, aes(x = PC1, y = PC2, color = lib_size, shape = pool_size)) +
  geom_point(size = 4) +
  theme_minimal() +
  labs(title = "PCA with top 8 samples per condition (colored by library size)\n
       downsampled to match library sizes",
       x = paste0("PC1 (", pc1_var_top8, "%)"),
       y = paste0("PC2 (", pc2_var_top8, "%)"),
       color = "Library size",
       shape = "Pool size") +
  scale_color_gradient(low = "blue", high = "red", labels = comma) +
  scale_shape_manual(values = c(16, 17, 18))

lenient

count_matrix_top8 <- as.matrix(counts_top8_lenient[,-c(1:2)])
rownames(count_matrix_top8) <- counts_top8_lenient$gene_id
logcpm_top8 <- cpm(count_matrix_top8, log=TRUE, prior.count=1)

# Remove genes with low variance for PCA
var_genes_top8 <- apply(logcpm_top8, 1, var)
high_var_genes_top8 <- order(var_genes_top8, decreasing=TRUE)[1:min(1000, nrow(logcpm_top8))]
pca_top8 <- prcomp(t(logcpm_top8[high_var_genes_top8,]), scale=TRUE)

pca_df_top8 <- data.frame(
  PC1 = pca_top8$x[,1],
  PC2 = pca_top8$x[,2],
  sample = rownames(pca_top8$x),
  pool_size = sample_info_top8$pool_size,
  lib_size = lib_sizes_top8_all[rownames(pca_top8$x)]
)

pc1_var_top8 <- round(summary(pca_top8)$importance[2,1]*100, 1)
pc2_var_top8 <- round(summary(pca_top8)$importance[2,2]*100, 1)

# PCA colored by pool size
ggplot(pca_df_top8, aes(x = PC1, y = PC2, color = pool_size)) +
  geom_point(size = 3) +
  theme_minimal() +
  labs(title = "PCA with top 8 samples per condition (colored by pool size)\n downsampled to match library sizes",
       x = paste0("PC1 (", pc1_var_top8, "%)"),
       y = paste0("PC2 (", pc2_var_top8, "%)"),
       color = "Pool size") +
  scale_color_brewer(type = "qual", palette = "Set1")

# PCA colored by library size
ggplot(pca_df_top8, aes(x = PC1, y = PC2, color = lib_size, shape = pool_size)) +
  geom_point(size = 4) +
  theme_minimal() +
  labs(title = "PCA with top 8 samples per condition (colored by library size)\n downsampled to match library sizes",
       x = paste0("PC1 (", pc1_var_top8, "%)"),
       y = paste0("PC2 (", pc2_var_top8, "%)"),
       color = "Library size",
       shape = "Pool size") +
  scale_color_gradient(low = "blue", high = "red", labels = comma) +
  scale_shape_manual(values = c(16, 17, 18))

DEG - Top 8 Samples

stringent

Model Diagnostics - All Samples

Before performing differential expression analysis.

# Convert to DGEList and calculate normalization factors
dge <- DGEList(counts = counts_top8_stringent %>% column_to_rownames(var="gene_id"), samples = sample_info_top8)
Setting first column of `counts` as gene annotation.
dge <- calcNormFactors(dge)

# Estimate dispersion and create dispersion plot
dge <- estimateDisp(dge, design=model.matrix(~ pool_size, data = sample_info_top8))

Statistical Analysis - All Samples

We use limma-trend for differential expression analysis, which is appropriate for RNA-seq data.

# Create design matrix
design <- model.matrix(~ 0 + pool_size, data = sample_info_top8)
colnames(design) <- levels(sample_info_top8$pool_size)

# Estimate mean-variance trend and fit linear model
logCPM <- cpm(dge, log = TRUE, prior.count = 1)
fit <- lmFit(logCPM, design)
fit <- eBayes(fit, trend = TRUE)

# Define contrasts for pairwise comparisons
contrasts <- makeContrasts(
  pool320_vs_pool080 = pool320 - pool080,
  pool920_vs_pool080 = pool920 - pool080,
  pool920_vs_pool320 = pool920 - pool320,
  levels = design
)

# Fit contrasts
fit_contrasts <- contrasts.fit(fit, contrasts)
fit_contrasts <- eBayes(fit_contrasts, trend = TRUE)

# Extract results for each comparison with top8 stringent suffix
results_320_vs_080_top8_stringent <- topTable(fit_contrasts, coef = "pool320_vs_pool080", 
                                              number = Inf, sort.by = "P")
results_920_vs_080_top8_stringent <- topTable(fit_contrasts, coef = "pool920_vs_pool080", 
                                              number = Inf, sort.by = "P")
results_920_vs_320_top8_stringent <- topTable(fit_contrasts, coef = "pool920_vs_pool320", 
                                              number = Inf, sort.by = "P")

Results Summary - Stringent Filtering

# Summary for 320ng vs 080ng
sig_320_vs_080 <- results_320_vs_080_top8_stringent[results_320_vs_080_top8_stringent$adj.P.Val < 0.05,]
up_320_vs_080 <- sum(sig_320_vs_080$logFC > 0)
down_320_vs_080 <- sum(sig_320_vs_080$logFC < 0)
print(paste("320ng vs 080ng:", nrow(sig_320_vs_080), "DE genes"))
[1] "320ng vs 080ng: 7 DE genes"
print(paste("  Up-regulated:", up_320_vs_080))
[1] "  Up-regulated: 6"
print(paste("  Down-regulated:", down_320_vs_080))
[1] "  Down-regulated: 1"
# Summary for 920ng vs 080ng
sig_920_vs_080_summary <- results_920_vs_080_top8_stringent[results_920_vs_080_top8_stringent$adj.P.Val < 0.05,]
up_920_vs_080 <- sum(sig_920_vs_080_summary$logFC > 0)
down_920_vs_080 <- sum(sig_920_vs_080_summary$logFC < 0)
print(paste("920ng vs 080ng:", nrow(sig_920_vs_080_summary), "DE genes"))
[1] "920ng vs 080ng: 100 DE genes"
print(paste("  Up-regulated:", up_920_vs_080))
[1] "  Up-regulated: 63"
print(paste("  Down-regulated:", down_920_vs_080))
[1] "  Down-regulated: 37"
# Summary for 920ng vs 320ng
sig_920_vs_320 <- results_920_vs_320_top8_stringent[results_920_vs_320_top8_stringent$adj.P.Val < 0.05,]
up_920_vs_320 <- sum(sig_920_vs_320$logFC > 0)
down_920_vs_320 <- sum(sig_920_vs_320$logFC < 0)
print(paste("920ng vs 320ng:", nrow(sig_920_vs_320), "DE genes"))
[1] "920ng vs 320ng: 35 DE genes"
print(paste("  Up-regulated:", up_920_vs_320))
[1] "  Up-regulated: 21"
print(paste("  Down-regulated:", down_920_vs_320))
[1] "  Down-regulated: 14"

lenient

Model Diagnostics - All Samples

Before performing differential expression analysis.

# Convert to DGEList and calculate normalization factors
dge <- DGEList(counts = counts_top8_lenient %>% column_to_rownames(var="gene_id"), samples = sample_info_top8)
Setting first column of `counts` as gene annotation.
dge <- calcNormFactors(dge)

# Estimate dispersion and create dispersion plot
dge <- estimateDisp(dge, design=model.matrix(~ pool_size, data = sample_info_top8))

Statistical Analysis - All Samples

We use limma-trend for differential expression analysis, which is appropriate for RNA-seq data.

# Create design matrix
design <- model.matrix(~ 0 + pool_size, data = sample_info_top8)
colnames(design) <- levels(sample_info_top8$pool_size)

# Estimate mean-variance trend and fit linear model
logCPM <- cpm(dge, log = TRUE, prior.count = 1)
fit <- lmFit(logCPM, design)
fit <- eBayes(fit, trend = TRUE)

# Define contrasts for pairwise comparisons
contrasts <- makeContrasts(
  pool320_vs_pool080 = pool320 - pool080,
  pool920_vs_pool080 = pool920 - pool080,
  pool920_vs_pool320 = pool920 - pool320,
  levels = design
)

# Fit contrasts
fit_contrasts <- contrasts.fit(fit, contrasts)
fit_contrasts <- eBayes(fit_contrasts, trend = TRUE)

# Extract results for each comparison with top8 lenient suffix
results_320_vs_080_top8_lenient <- topTable(fit_contrasts, coef = "pool320_vs_pool080", 
                                            number = Inf, sort.by = "P")
results_920_vs_080_top8_lenient <- topTable(fit_contrasts, coef = "pool920_vs_pool080", 
                                            number = Inf, sort.by = "P")
results_920_vs_320_top8_lenient <- topTable(fit_contrasts, coef = "pool920_vs_pool320", 
                                            number = Inf, sort.by = "P")

Results Summary - Lenient Filtering

# Summary for 320ng vs 080ng
sig_320_vs_080 <- results_320_vs_080_top8_lenient[results_320_vs_080_top8_lenient$adj.P.Val < 0.05,]
up_320_vs_080 <- sum(sig_320_vs_080$logFC > 0)
down_320_vs_080 <- sum(sig_320_vs_080$logFC < 0)
print(paste("320ng vs 080ng:", nrow(sig_320_vs_080), "DE genes"))
[1] "320ng vs 080ng: 13 DE genes"
print(paste("  Up-regulated:", up_320_vs_080))
[1] "  Up-regulated: 12"
print(paste("  Down-regulated:", down_320_vs_080))
[1] "  Down-regulated: 1"
# Summary for 920ng vs 080ng
sig_920_vs_080_summary <- results_920_vs_080_top8_lenient[results_920_vs_080_top8_lenient$adj.P.Val < 0.05,]
up_920_vs_080 <- sum(sig_920_vs_080_summary$logFC > 0)
down_920_vs_080 <- sum(sig_920_vs_080_summary$logFC < 0)
print(paste("920ng vs 080ng:", nrow(sig_920_vs_080_summary), "DE genes"))
[1] "920ng vs 080ng: 116 DE genes"
print(paste("  Up-regulated:", up_920_vs_080))
[1] "  Up-regulated: 78"
print(paste("  Down-regulated:", down_920_vs_080))
[1] "  Down-regulated: 38"
# Summary for 920ng vs 320ng
sig_920_vs_320 <- results_920_vs_320_top8_lenient[results_920_vs_320_top8_lenient$adj.P.Val < 0.05,]
up_920_vs_320 <- sum(sig_920_vs_320$logFC > 0)
down_920_vs_320 <- sum(sig_920_vs_320$logFC < 0)
print(paste("920ng vs 320ng:", nrow(sig_920_vs_320), "DE genes"))
[1] "920ng vs 320ng: 49 DE genes"
print(paste("  Up-regulated:", up_920_vs_320))
[1] "  Up-regulated: 34"
print(paste("  Down-regulated:", down_920_vs_320))
[1] "  Down-regulated: 15"

DE Gene Results Tables

All Samples - Stringent Filtering

320ng vs 080ng - All Samples Stringent

# Get significant genes and add gene names
sig_genes_320_vs_080_all_stringent <- results_320_vs_080 %>% 
  filter(adj.P.Val < 0.05) %>%
  rownames_to_column("gene_id") %>%
  left_join(genes, by="gene_id") %>%
  dplyr::select(gene_id, gene_name, logFC, AveExpr, t, P.Value, adj.P.Val, B) %>%
  arrange(desc(abs(logFC)))

print(paste("Number of significant genes (All samples - stringent):", nrow(sig_genes_320_vs_080_all_stringent)))
[1] "Number of significant genes (All samples - stringent): 3"
if(nrow(sig_genes_320_vs_080_all_stringent) > 0) {
  DT::datatable(sig_genes_320_vs_080_all_stringent, 
                caption = "DE genes: 320ng vs 080ng (All samples - stringent filtering, FDR < 0.05)",
                options = list(pageLength = 25, scrollX = TRUE, order = list(list(2, 'desc'))),
                filter = 'top') %>%
    DT::formatRound(columns = c('logFC', 'AveExpr', 't', 'P.Value', 'adj.P.Val', 'B'), digits = 4)
} else {
  cat("No significant genes found.")
}

920ng vs 080ng - All Samples Stringent

sig_genes_920_vs_080_all_stringent <- results_920_vs_080 %>% 
  filter(adj.P.Val < 0.05) %>%
  rownames_to_column("gene_id") %>%
  left_join(genes, by="gene_id") %>%
  dplyr::select(gene_id, gene_name, logFC, AveExpr, t, P.Value, adj.P.Val, B) %>%
  arrange(desc(abs(logFC)))

print(paste("Number of significant genes (All samples - stringent):", nrow(sig_genes_920_vs_080_all_stringent)))
[1] "Number of significant genes (All samples - stringent): 27"
if(nrow(sig_genes_920_vs_080_all_stringent) > 0) {
  DT::datatable(sig_genes_920_vs_080_all_stringent, 
                caption = "DE genes: 920ng vs 080ng (All samples - stringent filtering, FDR < 0.05)",
                options = list(pageLength = 25, scrollX = TRUE, order = list(list(2, 'desc'))),
                filter = 'top') %>%
    DT::formatRound(columns = c('logFC', 'AveExpr', 't', 'P.Value', 'adj.P.Val', 'B'), digits = 4)
} else {
  cat("No significant genes found.")
}

920ng vs 320ng - All Samples Stringent

sig_genes_920_vs_320_all_stringent <- results_920_vs_320 %>% 
  filter(adj.P.Val < 0.05) %>%
  rownames_to_column("gene_id") %>%
  left_join(genes, by="gene_id") %>%
  dplyr::select(gene_id, gene_name, logFC, AveExpr, t, P.Value, adj.P.Val, B) %>%
  arrange(desc(abs(logFC)))

print(paste("Number of significant genes (All samples - stringent):", nrow(sig_genes_920_vs_320_all_stringent)))
[1] "Number of significant genes (All samples - stringent): 15"
if(nrow(sig_genes_920_vs_320_all_stringent) > 0) {
  DT::datatable(sig_genes_920_vs_320_all_stringent, 
                caption = "DE genes: 920ng vs 320ng (All samples - stringent filtering, FDR < 0.05)",
                options = list(pageLength = 25, scrollX = TRUE, order = list(list(2, 'desc'))),
                filter = 'top') %>%
    DT::formatRound(columns = c('logFC', 'AveExpr', 't', 'P.Value', 'adj.P.Val', 'B'), digits = 4)
} else {
  cat("No significant genes found.")
}

All Samples - Lenient Filtering

320ng vs 080ng - All Samples Lenient

sig_genes_320_vs_080_all_lenient <- results_320_vs_080_lenient %>% 
  filter(adj.P.Val < 0.05) %>%
  rownames_to_column("gene_id") %>%
  left_join(genes, by="gene_id") %>%
  dplyr::select(gene_id, gene_name, logFC, AveExpr, t, P.Value, adj.P.Val, B) %>%
  arrange(desc(abs(logFC)))

print(paste("Number of significant genes (All samples - lenient):", nrow(sig_genes_320_vs_080_all_lenient)))
[1] "Number of significant genes (All samples - lenient): 4"
if(nrow(sig_genes_320_vs_080_all_lenient) > 0) {
  DT::datatable(sig_genes_320_vs_080_all_lenient, 
                caption = "DE genes: 320ng vs 080ng (All samples - lenient filtering, FDR < 0.05)",
                options = list(pageLength = 25, scrollX = TRUE, order = list(list(2, 'desc'))),
                filter = 'top') %>%
    DT::formatRound(columns = c('logFC', 'AveExpr', 't', 'P.Value', 'adj.P.Val', 'B'), digits = 4)
} else {
  cat("No significant genes found.")
}

920ng vs 080ng - All Samples Lenient

sig_genes_920_vs_080_all_lenient <- results_920_vs_080_lenient %>% 
  filter(adj.P.Val < 0.05) %>%
  rownames_to_column("gene_id") %>%
  left_join(genes, by="gene_id") %>%
  dplyr::select(gene_id, gene_name, logFC, AveExpr, t, P.Value, adj.P.Val, B) %>%
  arrange(desc(abs(logFC)))

print(paste("Number of significant genes (All samples - lenient):", nrow(sig_genes_920_vs_080_all_lenient)))
[1] "Number of significant genes (All samples - lenient): 37"
if(nrow(sig_genes_920_vs_080_all_lenient) > 0) {
  DT::datatable(sig_genes_920_vs_080_all_lenient, 
                caption = "DE genes: 920ng vs 080ng (All samples - lenient filtering, FDR < 0.05)",
                options = list(pageLength = 25, scrollX = TRUE, order = list(list(2, 'desc'))),
                filter = 'top') %>%
    DT::formatRound(columns = c('logFC', 'AveExpr', 't', 'P.Value', 'adj.P.Val', 'B'), digits = 4)
} else {
  cat("No significant genes found.")
}

920ng vs 320ng - All Samples Lenient

sig_genes_920_vs_320_all_lenient <- results_920_vs_320_lenient %>% 
  filter(adj.P.Val < 0.05) %>%
  rownames_to_column("gene_id") %>%
  left_join(genes, by="gene_id") %>%
  dplyr::select(gene_id, gene_name, logFC, AveExpr, t, P.Value, adj.P.Val, B) %>%
  arrange(desc(abs(logFC)))

print(paste("Number of significant genes (All samples - lenient):", nrow(sig_genes_920_vs_320_all_lenient)))
[1] "Number of significant genes (All samples - lenient): 29"
if(nrow(sig_genes_920_vs_320_all_lenient) > 0) {
  DT::datatable(sig_genes_920_vs_320_all_lenient, 
                caption = "DE genes: 920ng vs 320ng (All samples - lenient filtering, FDR < 0.05)",
                options = list(pageLength = 25, scrollX = TRUE, order = list(list(2, 'desc'))),
                filter = 'top') %>%
    DT::formatRound(columns = c('logFC', 'AveExpr', 't', 'P.Value', 'adj.P.Val', 'B'), digits = 4)
} else {
  cat("No significant genes found.")
}

Top 8 Samples - Stringent Filtering

320ng vs 080ng - Top 8 Stringent

sig_genes_320_vs_080_top8_stringent <- results_320_vs_080_top8_stringent %>% 
  filter(adj.P.Val < 0.05) %>%
  rownames_to_column("gene_id") %>%
  left_join(genes, by="gene_id") %>%
  dplyr::select(gene_id, gene_name, logFC, AveExpr, t, P.Value, adj.P.Val, B) %>%
  arrange(desc(abs(logFC)))

print(paste("Number of significant genes (Top 8 samples - stringent):", nrow(sig_genes_320_vs_080_top8_stringent)))
[1] "Number of significant genes (Top 8 samples - stringent): 7"
if(nrow(sig_genes_320_vs_080_top8_stringent) > 0) {
  DT::datatable(sig_genes_320_vs_080_top8_stringent, 
                caption = "DE genes: 320ng vs 080ng (Top 8 samples - stringent filtering, FDR < 0.05)",
                options = list(pageLength = 25, scrollX = TRUE, order = list(list(2, 'desc'))),
                filter = 'top') %>%
    DT::formatRound(columns = c('logFC', 'AveExpr', 't', 'P.Value', 'adj.P.Val', 'B'), digits = 4)
} else {
  cat("No significant genes found.")
}

920ng vs 080ng - Top 8 Stringent

sig_genes_920_vs_080_top8_stringent <- results_920_vs_080_top8_stringent %>% 
  filter(adj.P.Val < 0.05) %>%
  rownames_to_column("gene_id") %>%
  left_join(genes, by="gene_id") %>%
  dplyr::select(gene_id, gene_name, logFC, AveExpr, t, P.Value, adj.P.Val, B) %>%
  arrange(desc(abs(logFC)))

print(paste("Number of significant genes (Top 8 samples - stringent):", nrow(sig_genes_920_vs_080_top8_stringent)))
[1] "Number of significant genes (Top 8 samples - stringent): 100"
if(nrow(sig_genes_920_vs_080_top8_stringent) > 0) {
  DT::datatable(sig_genes_920_vs_080_top8_stringent, 
                caption = "DE genes: 920ng vs 080ng (Top 8 samples - stringent filtering, FDR < 0.05)",
                options = list(pageLength = 25, scrollX = TRUE, order = list(list(2, 'desc'))),
                filter = 'top') %>%
    DT::formatRound(columns = c('logFC', 'AveExpr', 't', 'P.Value', 'adj.P.Val', 'B'), digits = 4)
} else {
  cat("No significant genes found.")
}

920ng vs 320ng - Top 8 Stringent

sig_genes_920_vs_320_top8_stringent <- results_920_vs_320_top8_stringent %>% 
  filter(adj.P.Val < 0.05) %>%
  rownames_to_column("gene_id") %>%
  left_join(genes, by="gene_id") %>%
  dplyr::select(gene_id, gene_name, logFC, AveExpr, t, P.Value, adj.P.Val, B) %>%
  arrange(desc(abs(logFC)))

print(paste("Number of significant genes (Top 8 samples - stringent):", nrow(sig_genes_920_vs_320_top8_stringent)))
[1] "Number of significant genes (Top 8 samples - stringent): 35"
if(nrow(sig_genes_920_vs_320_top8_stringent) > 0) {
  DT::datatable(sig_genes_920_vs_320_top8_stringent, 
                caption = "DE genes: 920ng vs 320ng (Top 8 samples - stringent filtering, FDR < 0.05)",
                options = list(pageLength = 25, scrollX = TRUE, order = list(list(2, 'desc'))),
                filter = 'top') %>%
    DT::formatRound(columns = c('logFC', 'AveExpr', 't', 'P.Value', 'adj.P.Val', 'B'), digits = 4)
} else {
  cat("No significant genes found.")
}

Top 8 Samples - Lenient Filtering

320ng vs 080ng - Top 8 Lenient

sig_genes_320_vs_080_top8_lenient <- results_320_vs_080_top8_lenient %>% 
  filter(adj.P.Val < 0.05) %>%
  rownames_to_column("gene_id") %>%
  left_join(genes, by="gene_id") %>%
  dplyr::select(gene_id, gene_name, logFC, AveExpr, t, P.Value, adj.P.Val, B) %>%
  arrange(desc(abs(logFC)))

print(paste("Number of significant genes (Top 8 samples - lenient):", nrow(sig_genes_320_vs_080_top8_lenient)))
[1] "Number of significant genes (Top 8 samples - lenient): 13"
if(nrow(sig_genes_320_vs_080_top8_lenient) > 0) {
  DT::datatable(sig_genes_320_vs_080_top8_lenient, 
                caption = "DE genes: 320ng vs 080ng (Top 8 samples - lenient filtering, FDR < 0.05)",
                options = list(pageLength = 25, scrollX = TRUE, order = list(list(2, 'desc'))),
                filter = 'top') %>%
    DT::formatRound(columns = c('logFC', 'AveExpr', 't', 'P.Value', 'adj.P.Val', 'B'), digits = 4)
} else {
  cat("No significant genes found.")
}

920ng vs 080ng - Top 8 Lenient

sig_genes_920_vs_080_top8_lenient <- results_920_vs_080_top8_lenient %>% 
  filter(adj.P.Val < 0.05) %>%
  rownames_to_column("gene_id") %>%
  left_join(genes, by="gene_id") %>%
  dplyr::select(gene_id, gene_name, logFC, AveExpr, t, P.Value, adj.P.Val, B) %>%
  arrange(desc(abs(logFC)))

print(paste("Number of significant genes (Top 8 samples - lenient):", nrow(sig_genes_920_vs_080_top8_lenient)))
[1] "Number of significant genes (Top 8 samples - lenient): 116"
if(nrow(sig_genes_920_vs_080_top8_lenient) > 0) {
  DT::datatable(sig_genes_920_vs_080_top8_lenient, 
                caption = "DE genes: 920ng vs 080ng (Top 8 samples - lenient filtering, FDR < 0.05)",
                options = list(pageLength = 25, scrollX = TRUE, order = list(list(2, 'desc'))),
                filter = 'top') %>%
    DT::formatRound(columns = c('logFC', 'AveExpr', 't', 'P.Value', 'adj.P.Val', 'B'), digits = 4)
} else {
  cat("No significant genes found.")
}

920ng vs 320ng - Top 8 Lenient

sig_genes_920_vs_320_top8_lenient <- results_920_vs_320_top8_lenient %>% 
  filter(adj.P.Val < 0.05) %>%
  rownames_to_column("gene_id") %>%
  left_join(genes, by="gene_id") %>%
  dplyr::select(gene_id, gene_name, logFC, AveExpr, t, P.Value, adj.P.Val, B) %>%
  arrange(desc(abs(logFC)))

print(paste("Number of significant genes (Top 8 samples - lenient):", nrow(sig_genes_920_vs_320_top8_lenient)))
[1] "Number of significant genes (Top 8 samples - lenient): 49"
if(nrow(sig_genes_920_vs_320_top8_lenient) > 0) {
  DT::datatable(sig_genes_920_vs_320_top8_lenient, 
                caption = "DE genes: 920ng vs 320ng (Top 8 samples - lenient filtering, FDR < 0.05)",
                options = list(pageLength = 25, scrollX = TRUE, order = list(list(2, 'desc'))),
                filter = 'top') %>%
    DT::formatRound(columns = c('logFC', 'AveExpr', 't', 'P.Value', 'adj.P.Val', 'B'), digits = 4)
} else {
  cat("No significant genes found.")
}

DE Gene Overlap Analysis

UpSet Plot of DE Gene Overlaps

This section visualizes the overlap of differentially expressed genes across all 12 analysis conditions using an UpSet plot.

# Extract significant gene IDs from all 12 conditions
de_gene_lists <- list(
  "All_Stringent_320v080" = rownames(results_320_vs_080[results_320_vs_080$adj.P.Val < 0.05, ]),
  "All_Stringent_920v080" = rownames(results_920_vs_080[results_920_vs_080$adj.P.Val < 0.05, ]),
  "All_Stringent_920v320" = rownames(results_920_vs_320[results_920_vs_320$adj.P.Val < 0.05, ]),
  
  "All_Lenient_320v080" = rownames(results_320_vs_080_lenient[results_320_vs_080_lenient$adj.P.Val < 0.05, ]),
  "All_Lenient_920v080" = rownames(results_920_vs_080_lenient[results_920_vs_080_lenient$adj.P.Val < 0.05, ]),
  "All_Lenient_920v320" = rownames(results_920_vs_320_lenient[results_920_vs_320_lenient$adj.P.Val < 0.05, ]),
  
  "Top8_Stringent_320v080" = rownames(results_320_vs_080_top8_stringent[results_320_vs_080_top8_stringent$adj.P.Val < 0.05, ]),
  "Top8_Stringent_920v080" = rownames(results_920_vs_080_top8_stringent[results_920_vs_080_top8_stringent$adj.P.Val < 0.05, ]),
  "Top8_Stringent_920v320" = rownames(results_920_vs_320_top8_stringent[results_920_vs_320_top8_stringent$adj.P.Val < 0.05, ]),
  
  "Top8_Lenient_320v080" = rownames(results_320_vs_080_top8_lenient[results_320_vs_080_top8_lenient$adj.P.Val < 0.05, ]),
  "Top8_Lenient_920v080" = rownames(results_920_vs_080_top8_lenient[results_920_vs_080_top8_lenient$adj.P.Val < 0.05, ]),
  "Top8_Lenient_920v320" = rownames(results_920_vs_320_top8_lenient[results_920_vs_320_top8_lenient$adj.P.Val < 0.05, ])
)

de_gene_lists <- list(
  "All_Stringent_320v080" = rownames(results_320_vs_080[results_320_vs_080$adj.P.Val < 0.05, ]),
  "All_Lenient_320v080" = rownames(results_320_vs_080_lenient[results_320_vs_080_lenient$adj.P.Val < 0.05, ]),
  "Top8_Stringent_320v080" = rownames(results_320_vs_080_top8_stringent[results_320_vs_080_top8_stringent$adj.P.Val < 0.05, ]),
  "Top8_Lenient_320v080" = rownames(results_320_vs_080_top8_lenient[results_320_vs_080_top8_lenient$adj.P.Val < 0.05, ]),
  
  "All_Stringent_920v080" = rownames(results_920_vs_080[results_920_vs_080$adj.P.Val < 0.05, ]),
  "All_Lenient_920v080" = rownames(results_920_vs_080_lenient[results_920_vs_080_lenient$adj.P.Val < 0.05, ]),
  "Top8_Stringent_920v080" = rownames(results_920_vs_080_top8_stringent[results_920_vs_080_top8_stringent$adj.P.Val < 0.05, ]),
  "Top8_Lenient_920v080" = rownames(results_920_vs_080_top8_lenient[results_920_vs_080_top8_lenient$adj.P.Val < 0.05, ]),
  
  "All_Stringent_920v320" = rownames(results_920_vs_320[results_920_vs_320$adj.P.Val < 0.05, ]),
  "All_Lenient_920v320" = rownames(results_920_vs_320_lenient[results_920_vs_320_lenient$adj.P.Val < 0.05, ]),
  "Top8_Stringent_920v320" = rownames(results_920_vs_320_top8_stringent[results_920_vs_320_top8_stringent$adj.P.Val < 0.05, ]),
  "Top8_Lenient_920v320" = rownames(results_920_vs_320_top8_lenient[results_920_vs_320_top8_lenient$adj.P.Val < 0.05, ])
)

# Print summary of DE genes per condition
cat("DE genes per condition (FDR < 0.05):\n")
DE genes per condition (FDR < 0.05):
sapply(de_gene_lists, length)
 All_Stringent_320v080    All_Lenient_320v080 Top8_Stringent_320v080 
                     3                      4                      7 
  Top8_Lenient_320v080  All_Stringent_920v080    All_Lenient_920v080 
                    13                     27                     37 
Top8_Stringent_920v080   Top8_Lenient_920v080  All_Stringent_920v320 
                   100                    116                     15 
   All_Lenient_920v320 Top8_Stringent_920v320   Top8_Lenient_920v320 
                    29                     35                     49 
# Create UpSet plot
upset(fromList(de_gene_lists), 
      order.by = "freq",
      nsets = 12,
      sets = names(de_gene_lists),
      keep.order = TRUE,
      nintersects = 30,
      sets.bar.color = "#56B4E9",
      main.bar.color = "#E69F00",
      matrix.color = "#E69F00",
      text.scale = c(1.3, 1.3, 1, 1, 2, 1.5),
      set_size.show = TRUE,
      set_size.scale_max = max(sapply(de_gene_lists, length)) * 1.1)

Summary of DE Gene Overlaps

# Create a summary table of overlaps
overlap_matrix <- fromList(de_gene_lists)

# Calculate total unique genes
total_unique_genes <- nrow(overlap_matrix)
cat("Total unique DE genes across all conditions:", total_unique_genes, "\n\n")
Total unique DE genes across all conditions: 133 
# Find genes that are DE in all contrasts for each analysis type
all_contrasts_all_stringent <- rownames(overlap_matrix)[
  overlap_matrix$All_Stringent_320v080 == 1 & 
    overlap_matrix$All_Stringent_920v080 == 1 & 
    overlap_matrix$All_Stringent_920v320 == 1]

all_contrasts_all_lenient <- rownames(overlap_matrix)[
  overlap_matrix$All_Lenient_320v080 == 1 & 
    overlap_matrix$All_Lenient_920v080 == 1 & 
    overlap_matrix$All_Lenient_920v320 == 1]

all_contrasts_top8_stringent <- rownames(overlap_matrix)[
  overlap_matrix$Top8_Stringent_320v080 == 1 & 
    overlap_matrix$Top8_Stringent_920v080 == 1 & 
    overlap_matrix$Top8_Stringent_920v320 == 1]

all_contrasts_top8_lenient <- rownames(overlap_matrix)[
  overlap_matrix$Top8_Lenient_320v080 == 1 & 
    overlap_matrix$Top8_Lenient_920v080 == 1 & 
    overlap_matrix$Top8_Lenient_920v320 == 1]

# Find genes that are consistently DE across all 12 conditions
consistent_genes <- rownames(overlap_matrix)[rowSums(overlap_matrix) == 12]

cat("Genes DE in all 3 contrasts per analysis type:\n")
Genes DE in all 3 contrasts per analysis type:
cat("All samples - stringent filtering:", length(all_contrasts_all_stringent), "genes\n")
All samples - stringent filtering: 0 genes
cat("All samples - lenient filtering:", length(all_contrasts_all_lenient), "genes\n")
All samples - lenient filtering: 1 genes
cat("Top 8 samples - stringent filtering:", length(all_contrasts_top8_stringent), "genes\n")
Top 8 samples - stringent filtering: 0 genes
cat("Top 8 samples - lenient filtering:", length(all_contrasts_top8_lenient), "genes\n\n")
Top 8 samples - lenient filtering: 6 genes
cat("Genes consistently DE across ALL 12 conditions:", length(consistent_genes), "genes\n")
Genes consistently DE across ALL 12 conditions: 0 genes
if(length(consistent_genes) > 0) {
  cat("Consistently DE genes:\n")
  consistent_genes_with_names <- genes[genes$gene_id %in% consistent_genes, c("gene_id", "gene_name")]
  knitr::kable(consistent_genes_with_names, caption = "Genes DE in all 12 analysis conditions")
}

Analysis-Specific Overlaps

# Create separate UpSet plots for different groupings
# All stringent
all_stringent_lists <- de_gene_lists[grepl("^All_.*Stringent", names(de_gene_lists))]

# All lenient
all_lenient_lists <- de_gene_lists[grepl("^All_.*Lenient", names(de_gene_lists))]

# Top 8 stringent
top8_stringent_lists <- de_gene_lists[grepl("^Top8_.*Stringent", names(de_gene_lists))]

# Top 8 lenient
top8_lenient_lists <- de_gene_lists[grepl("^Top8_.*Lenient", names(de_gene_lists))]

# 1. All samples (stringent vs lenient)
all_samples_lists <- de_gene_lists[grepl("^All_", names(de_gene_lists))]

# 2. Top 8 samples (stringent vs lenient)  
top8_samples_lists <- de_gene_lists[grepl("^Top8_", names(de_gene_lists))]

# 3. Stringent filtering (all vs top8)
stringent_lists <- de_gene_lists[grepl("Stringent", names(de_gene_lists))]

# 4. Lenient filtering (all vs top8)
lenient_lists <- de_gene_lists[grepl("Lenient", names(de_gene_lists))]


# All samples comparison
upset(fromList(all_samples_lists), 
      order.by = "freq",
      nsets = 6,
      nintersects = 20,
      sets.bar.color = "#56B4E9",
      main.bar.color = "#E69F00",
      matrix.color = "#E69F00",
      text.scale = c(1.2, 1.2, 1, 1, 1.5, 1.2),
      set_size.show = TRUE,
      mainbar.y.label = "Intersection Size",
      sets.x.label = "Set Size")

All stringent

upset(fromList(all_stringent_lists), 
      order.by = "freq",
      nsets = 6,
      nintersects = 20,
      sets.bar.color = "#56B4E9",
      main.bar.color = "#E69F00",
      matrix.color = "#E69F00",
      text.scale = c(1.2, 1.2, 1, 1, 1.5, 1.2),
      set_size.show = TRUE,
      mainbar.y.label = "Intersection Size",
      sets.x.label = "Set Size")

All lenient

upset(fromList(all_lenient_lists), 
      order.by = "freq",
      nsets = 6,
      nintersects = 20,
      sets.bar.color = "#56B4E9",
      main.bar.color = "#E69F00",
      matrix.color = "#E69F00",
      text.scale = c(1.2, 1.2, 1, 1, 1.5, 1.2),
      set_size.show = TRUE,
      mainbar.y.label = "Intersection Size",
      sets.x.label = "Set Size")

Top8 stringent

upset(fromList(top8_stringent_lists), 
      order.by = "freq",
      nsets = 6,
      nintersects = 20,
      sets.bar.color = "#56B4E9",
      main.bar.color = "#E69F00",
      matrix.color = "#E69F00",
      text.scale = c(1.2, 1.2, 1, 1, 1.5, 1.2),
      set_size.show = TRUE,
      mainbar.y.label = "Intersection Size",
      sets.x.label = "Set Size")

Top8 lenient

upset(fromList(top8_lenient_lists), 
      order.by = "freq",
      nsets = 6,
      nintersects = 20,
      sets.bar.color = "#56B4E9",
      main.bar.color = "#E69F00",
      matrix.color = "#E69F00",
      text.scale = c(1.2, 1.2, 1, 1, 1.5, 1.2),
      set_size.show = TRUE,
      mainbar.y.label = "Intersection Size",
      sets.x.label = "Set Size")

Top 8 Samples: Stringent vs Lenient Filtering

# Top 8 samples comparison
upset(fromList(top8_samples_lists), 
      order.by = "freq",
      nsets = 6,
      nintersects = 20,
      sets.bar.color = "#56B4E9",
      main.bar.color = "#E69F00",
      matrix.color = "#E69F00",
      text.scale = c(1.2, 1.2, 1, 1, 1.5, 1.2),
      set_size.show = TRUE,
      mainbar.y.label = "Intersection Size",
      sets.x.label = "Set Size")

Stringent Filtering: All vs Top 8 Samples

# Stringent filtering comparison
upset(fromList(stringent_lists), 
      order.by = "freq",
      nsets = 6,
      nintersects = 20,
      sets.bar.color = "#56B4E9",
      main.bar.color = "#E69F00",
      matrix.color = "#E69F00",
      text.scale = c(1.2, 1.2, 1, 1, 1.5, 1.2),
      set_size.show = TRUE,
      mainbar.y.label = "Intersection Size",
      sets.x.label = "Set Size")

Lenient Filtering: All vs Top 8 Samples

# Lenient filtering comparison
upset(fromList(lenient_lists), 
      order.by = "freq",
      nsets = 6,
      nintersects = 20,
      sets.bar.color = "#56B4E9",
      main.bar.color = "#E69F00",
      matrix.color = "#E69F00",
      text.scale = c(1.2, 1.2, 1, 1, 1.5, 1.2),
      set_size.show = TRUE,
      mainbar.y.label = "Intersection Size",
      sets.x.label = "Set Size")

Gene Set Enrichment Analysis

Configuration: Select DEG Dataset for GO Enrichment

# USER CONFIGURATION: Choose which DEG dataset to use for GO enrichment
# Options:
# 1. "all_stringent" - All samples with stringent filtering
# 2. "all_lenient" - All samples with lenient filtering  
# 3. "top8_stringent" - Top 8 samples with stringent filtering
# 4. "top8_lenient" - Top 8 samples with lenient filtering

selected_deg_dataset <- "top8_lenient"  # Change this to select different dataset

# USER CONFIGURATION: Choose GO category for enrichment analysis
# Options:
# - "BP" - Biological Process
# - "MF" - Molecular Function  
# - "CC" - Cellular Component

selected_go_category <- "MF"  # Change this to select different GO category

# Define the results objects based on selection
if(selected_deg_dataset == "all_stringent") {
  selected_results_320_080 <- results_320_vs_080
  selected_results_920_080 <- results_920_vs_080
  selected_results_920_320 <- results_920_vs_320
  dataset_description <- "All samples with stringent filtering"
} else if(selected_deg_dataset == "all_lenient") {
  selected_results_320_080 <- results_320_vs_080_lenient
  selected_results_920_080 <- results_920_vs_080_lenient
  selected_results_920_320 <- results_920_vs_320_lenient
  dataset_description <- "All samples with lenient filtering"
} else if(selected_deg_dataset == "top8_stringent") {
  selected_results_320_080 <- results_320_vs_080_top8_stringent
  selected_results_920_080 <- results_920_vs_080_top8_stringent
  selected_results_920_320 <- results_920_vs_320_top8_stringent
  dataset_description <- "Top 8 samples with stringent filtering"
} else if(selected_deg_dataset == "top8_lenient") {
  selected_results_320_080 <- results_320_vs_080_top8_lenient
  selected_results_920_080 <- results_920_vs_080_top8_lenient
  selected_results_920_320 <- results_920_vs_320_top8_lenient
  dataset_description <- "Top 8 samples with lenient filtering"
} else {
  stop("Invalid selection. Choose from: all_stringent, all_lenient, top8_stringent, top8_lenient")
}

# Define GO category description
if(selected_go_category == "BP") {
  go_description <- "Biological Process"
} else if(selected_go_category == "MF") {
  go_description <- "Molecular Function"
} else if(selected_go_category == "CC") {
  go_description <- "Cellular Component"
} else {
  stop("Invalid GO category. Choose from: BP, MF, CC")
}

cat("Selected DEG dataset for GO enrichment:", dataset_description, "\n")
Selected DEG dataset for GO enrichment: Top 8 samples with lenient filtering 
cat("Selected GO category:", go_description, "(", selected_go_category, ")\n")
Selected GO category: Molecular Function ( MF )
cat("Change variables above to use different dataset or GO category\n\n")
Change variables above to use different dataset or GO category

GO Enrichment Analysis - Selected Dataset

This section performs Gene Ontology (GO) enrichment analysis on the differentially expressed genes from the selected dataset: Top 8 samples with lenient filtering using Molecular Function terms.

Gene ID Conversion and Background

# Convert gene symbols to Entrez IDs for mouse
# First, let's check what type of gene IDs we have
head(genes)
               gene_id     gene_name
1 ENSMUSG00000102693.2 4933401J01Rik
2 ENSMUSG00000064842.3       Gm26206
3 ENSMUSG00000051951.6          Xkr4
4 ENSMUSG00000102851.2       Gm18956
5 ENSMUSG00000103377.2       Gm37180
6 ENSMUSG00000104017.2       Gm37363
# Create condition-specific expressed gene sets
# Define expressed genes for each pool size condition (mean CPM > 1 within condition)

# Get samples for each condition
pool080_samples <- sample_info$sample[sample_info$pool_size == "pool080"]
pool320_samples <- sample_info$sample[sample_info$pool_size == "pool320"]
pool920_samples <- sample_info$sample[sample_info$pool_size == "pool920"]

# Calculate condition-specific expression
count_matrix_lenient <- as.matrix(counts_filt_alt[,-c(1:2)])
rownames(count_matrix_lenient) <- counts_filt_alt$gene_id
cpm_lenient <- cpm(count_matrix_lenient)

# Get expressed genes per condition (mean CPM > 1 within condition)
expressed_pool080 <- rownames(cpm_lenient)[apply(cpm_lenient[, pool080_samples], 1, function(x) mean(x) > 1)]
expressed_pool320 <- rownames(cpm_lenient)[apply(cpm_lenient[, pool320_samples], 1, function(x) mean(x) > 1)]
expressed_pool920 <- rownames(cpm_lenient)[apply(cpm_lenient[, pool920_samples], 1, function(x) mean(x) > 1)]

# Create pairwise universes (genes expressed in both conditions of each contrast)
universe_320_080_genes <- intersect(expressed_pool320, expressed_pool080)
universe_920_080_genes <- intersect(expressed_pool920, expressed_pool080)
universe_920_320_genes <- intersect(expressed_pool920, expressed_pool320)

print(paste("Genes expressed in pool080:", length(expressed_pool080)))
[1] "Genes expressed in pool080: 12817"
print(paste("Genes expressed in pool320:", length(expressed_pool320)))
[1] "Genes expressed in pool320: 12837"
print(paste("Genes expressed in pool920:", length(expressed_pool920)))
[1] "Genes expressed in pool920: 12867"
print(paste("Universe 320v080 (genes expressed in both):", length(universe_320_080_genes)))
[1] "Universe 320v080 (genes expressed in both): 12789"
print(paste("Universe 920v080 (genes expressed in both):", length(universe_920_080_genes)))
[1] "Universe 920v080 (genes expressed in both): 12816"
print(paste("Universe 920v320 (genes expressed in both):", length(universe_920_320_genes)))
[1] "Universe 920v320 (genes expressed in both): 12835"
# Convert to gene names and then to Entrez IDs for each universe
get_entrez_universe <- function(gene_ids) {
  gene_names <- genes$gene_name[genes$gene_id %in% gene_ids]
  entrez_ids <- mapIds(org.Mm.eg.db, 
                       keys = gene_names, 
                       column = "ENTREZID", 
                       keytype = "SYMBOL",
                       multiVals = "first")
  return(entrez_ids[!is.na(entrez_ids)])
}

universe_320_080_entrez <- get_entrez_universe(universe_320_080_genes)
universe_920_080_entrez <- get_entrez_universe(universe_920_080_genes)
universe_920_320_entrez <- get_entrez_universe(universe_920_320_genes)

print(paste("Universe 320v080 with Entrez IDs:", length(universe_320_080_entrez)))
[1] "Universe 320v080 with Entrez IDs: 12292"
print(paste("Universe 920v080 with Entrez IDs:", length(universe_920_080_entrez)))
[1] "Universe 920v080 with Entrez IDs: 12314"
print(paste("Universe 920v320 with Entrez IDs:", length(universe_920_320_entrez)))
[1] "Universe 920v320 with Entrez IDs: 12322"

320ng vs 080ng GO Enrichment

# Get significant genes from selected dataset
sig_genes_320_vs_080 <- selected_results_320_080 %>% 
  filter(adj.P.Val < 0.05) %>%
  rownames_to_column("gene_id") %>%
  left_join(genes, by="gene_id")

print(paste("DE genes for 320ng vs 080ng (", dataset_description, "):", nrow(sig_genes_320_vs_080)))
[1] "DE genes for 320ng vs 080ng ( Top 8 samples with lenient filtering ): 13"
if(nrow(sig_genes_320_vs_080) > 0) {
  # Convert to Entrez IDs
  de_genes_entrez_320_080 <- mapIds(org.Mm.eg.db, 
                                    keys = sig_genes_320_vs_080$gene_name, 
                                    column = "ENTREZID", 
                                    keytype = "SYMBOL",
                                    multiVals = "first")
  
  # Remove NA values
  de_genes_entrez_320_080 <- de_genes_entrez_320_080[!is.na(de_genes_entrez_320_080)]
  
  print(paste("DE genes with Entrez IDs:", length(de_genes_entrez_320_080)))
  
  if(length(de_genes_entrez_320_080) > 5) {
    # Use condition-specific universe (genes expressed in both 320ng and 080ng conditions)
    print(paste("Universe size (genes expressed in both 320ng and 080ng):", length(universe_320_080_entrez)))
    print(paste("DE genes for enrichment:", length(de_genes_entrez_320_080)))
    
    # GO enrichment analysis with condition-specific background
    go_results_320_080 <- enrichGO(gene = de_genes_entrez_320_080,
                                   universe = universe_320_080_entrez,
                                   OrgDb = org.Mm.eg.db,
                                   ont = selected_go_category,  # Use selected GO category
                                   pAdjustMethod = "BH",
                                   qvalueCutoff = 0.05,
                                   readable = TRUE)
    
    if(nrow(go_results_320_080) > 0) {
      # Display results table
      knitr::kable(head(go_results_320_080@result, 10), 
                   caption = paste("Top 10 GO", go_description, "terms - 320ng vs 080ng (", dataset_description, ")"))
      
      # Dot plot
      p1 <- dotplot(go_results_320_080, showCategory = 15) + 
        ggtitle(paste("GO", selected_go_category, "Enrichment: 320ng vs 080ng\n", dataset_description))
      print(p1)
      
      # Bar plot
      p2 <- barplot(go_results_320_080, showCategory = 15) + 
        ggtitle(paste("GO", selected_go_category, "Enrichment: 320ng vs 080ng\n", dataset_description))
      print(p2)
      
      # Enrichment map (if enough terms)
      if(nrow(go_results_320_080) >= 5) {
        p3 <- emapplot(pairwise_termsim(go_results_320_080), showCategory = 20) + 
          ggtitle(paste("GO", selected_go_category, "Enrichment Map: 320ng vs 080ng\n", dataset_description))
        print(p3)
      }
    } else {
      cat("No significant GO terms found for 320ng vs 080ng")
    }
  } else {
    cat("Not enough genes with Entrez IDs for enrichment analysis")
  }
} else {
  cat("No DE genes found for 320ng vs 080ng")
}
'select()' returned 1:1 mapping between keys and columns
[1] "DE genes with Entrez IDs: 5"
Not enough genes with Entrez IDs for enrichment analysis

920ng vs 080ng GO Enrichment

# Get significant genes from selected dataset
sig_genes_920_vs_080 <- selected_results_920_080 %>% 
  filter(adj.P.Val < 0.05) %>%
  rownames_to_column("gene_id") %>%
  left_join(genes, by="gene_id")

print(paste("DE genes for 920ng vs 080ng (", dataset_description, "):", nrow(sig_genes_920_vs_080)))
[1] "DE genes for 920ng vs 080ng ( Top 8 samples with lenient filtering ): 116"
if(nrow(sig_genes_920_vs_080) > 0) {
  # Convert to Entrez IDs
  de_genes_entrez_920_080 <- mapIds(org.Mm.eg.db, 
                                    keys = sig_genes_920_vs_080$gene_name, 
                                    column = "ENTREZID", 
                                    keytype = "SYMBOL",
                                    multiVals = "first")
  
  # Remove NA values
  de_genes_entrez_920_080 <- de_genes_entrez_920_080[!is.na(de_genes_entrez_920_080)]
  
  print(paste("DE genes with Entrez IDs:", length(de_genes_entrez_920_080)))
  
  if(length(de_genes_entrez_920_080) > 5) {
    # Use condition-specific universe (genes expressed in both 920ng and 080ng conditions)
    print(paste("Universe size (genes expressed in both 920ng and 080ng):", length(universe_920_080_entrez)))
    print(paste("DE genes for enrichment:", length(de_genes_entrez_920_080)))
    
    # GO enrichment analysis with condition-specific background
    go_results_920_080 <- enrichGO(gene = de_genes_entrez_920_080,
                                   universe = universe_920_080_entrez,
                                   OrgDb = org.Mm.eg.db,
                                   ont = selected_go_category,  # Use selected GO category
                                   pAdjustMethod = "BH",
                                   qvalueCutoff = 0.05,
                                   readable = TRUE)
    
    if(nrow(go_results_920_080) > 0) {
      # Display results table
      knitr::kable(head(go_results_920_080@result, 10), 
                   caption = paste("Top 10 GO", go_description, "terms - 920ng vs 080ng (", dataset_description, ")"))
      
      # Dot plot
      p1 <- dotplot(go_results_920_080, showCategory = 15) + 
        ggtitle(paste("GO", selected_go_category, "Enrichment: 920ng vs 080ng\n", dataset_description))
      print(p1)
      
      # Bar plot
      p2 <- barplot(go_results_920_080, showCategory = 15) + 
        ggtitle(paste("GO", selected_go_category, "Enrichment: 920ng vs 080ng\n", dataset_description))
      print(p2)
      
      # Enrichment map (if enough terms)
      if(nrow(go_results_920_080) >= 5) {
        p3 <- emapplot(pairwise_termsim(go_results_920_080), showCategory = 20) + 
          ggtitle(paste("GO", selected_go_category, "Enrichment Map: 920ng vs 080ng\n", dataset_description))
        print(p3)
      }
    } else {
      cat("No significant GO terms found for 920ng vs 080ng")
    }
  } else {
    cat("Not enough genes with Entrez IDs for enrichment analysis")
  }
} else {
  cat("No DE genes found for 920ng vs 080ng")
}
'select()' returned 1:1 mapping between keys and columns
[1] "DE genes with Entrez IDs: 97"
[1] "Universe size (genes expressed in both 920ng and 080ng): 12314"
[1] "DE genes for enrichment: 97"

920ng vs 320ng GO Enrichment

# Get significant genes from selected dataset
sig_genes_920_vs_320 <- selected_results_920_320 %>% 
  filter(adj.P.Val < 0.05) %>%
  rownames_to_column("gene_id") %>%
  left_join(genes, by="gene_id")

print(paste("DE genes for 920ng vs 320ng (", dataset_description, "):", nrow(sig_genes_920_vs_320)))
[1] "DE genes for 920ng vs 320ng ( Top 8 samples with lenient filtering ): 49"
if(nrow(sig_genes_920_vs_320) > 0) {
  # Convert to Entrez IDs
  de_genes_entrez_920_320 <- mapIds(org.Mm.eg.db, 
                                    keys = sig_genes_920_vs_320$gene_name, 
                                    column = "ENTREZID", 
                                    keytype = "SYMBOL",
                                    multiVals = "first")
  
  # Remove NA values
  de_genes_entrez_920_320 <- de_genes_entrez_920_320[!is.na(de_genes_entrez_920_320)]
  
  print(paste("DE genes with Entrez IDs:", length(de_genes_entrez_920_320)))
  
  if(length(de_genes_entrez_920_320) > 5) {
    # Use condition-specific universe (genes expressed in both 920ng and 320ng conditions)
    print(paste("Universe size (genes expressed in both 920ng and 320ng):", length(universe_920_320_entrez)))
    print(paste("DE genes for enrichment:", length(de_genes_entrez_920_320)))
    
    # GO enrichment analysis with condition-specific background
    go_results_920_320 <- enrichGO(gene = de_genes_entrez_920_320,
                                   universe = universe_920_320_entrez,
                                   OrgDb = org.Mm.eg.db,
                                   ont = selected_go_category,  # Use selected GO category
                                   pAdjustMethod = "BH",
                                   qvalueCutoff = 0.05,
                                   readable = TRUE)
    
    if(nrow(go_results_920_320) > 0) {
      # Display results table
      knitr::kable(head(go_results_920_320@result, 10), 
                   caption = paste("Top 10 GO", go_description, "terms - 920ng vs 320ng (", dataset_description, ")"))
      
      # Dot plot
      p1 <- dotplot(go_results_920_320, showCategory = 15) + 
        ggtitle(paste("GO", selected_go_category, "Enrichment: 920ng vs 320ng\n", dataset_description))
      print(p1)
      
      # Bar plot
      p2 <- barplot(go_results_920_320, showCategory = 15) + 
        ggtitle(paste("GO", selected_go_category, "Enrichment: 920ng vs 320ng\n", dataset_description))
      print(p2)
      
      # Enrichment map (if enough terms)
      if(nrow(go_results_920_320) >= 5) {
        p3 <- emapplot(pairwise_termsim(go_results_920_320), showCategory = 20) + 
          ggtitle(paste("GO", selected_go_category, "Enrichment Map: 920ng vs 320ng\n", dataset_description))
        print(p3)
      }
    } else {
      cat("No significant GO terms found for 920ng vs 320ng")
    }
  } else {
    cat("Not enough genes with Entrez IDs for enrichment analysis")
  }
} else {
  cat("No DE genes found for 920ng vs 320ng")
}
'select()' returned 1:1 mapping between keys and columns
[1] "DE genes with Entrez IDs: 34"
[1] "Universe size (genes expressed in both 920ng and 320ng): 12322"
[1] "DE genes for enrichment: 34"

GO Enrichment Summary

# Summary of enrichment results for selected dataset
cat("=== GO ENRICHMENT SUMMARY ===\n")
=== GO ENRICHMENT SUMMARY ===
cat("Selected dataset:", dataset_description, "\n")
Selected dataset: Top 8 samples with lenient filtering 
cat("Selected GO category:", go_description, "(", selected_go_category, ")\n")
Selected GO category: Molecular Function ( MF )
cat("Background strategy: Genes expressed in both conditions of each contrast\n\n")
Background strategy: Genes expressed in both conditions of each contrast
if(exists("go_results_320_080")) {
  cat("320ng vs 080ng: ", nrow(go_results_320_080), " significant GO terms\n")
}
if(exists("go_results_920_080")) {
  cat("920ng vs 080ng: ", nrow(go_results_920_080), " significant GO terms\n")  
}
920ng vs 080ng:  2  significant GO terms
if(exists("go_results_920_320")) {
  cat("920ng vs 320ng: ", nrow(go_results_920_320), " significant GO terms\n")
}
920ng vs 320ng:  3  significant GO terms

DE Gene Count Visualization

Number of Up/Down-regulated Genes per Contrast

This section creates barplots showing the number of up- and down-regulated genes for each contrast across all analysis setups.

# Function to extract DE gene counts
get_de_counts <- function(results_list, setup_name) {
  
  contrasts <- c("320v080", "920v080", "920v320")
  
  de_counts <- data.frame(
    Setup = character(),
    Contrast = character(),
    Direction = character(),
    Count = numeric(),
    stringsAsFactors = FALSE
  )
  
  for(i in 1:length(results_list)) {
    result <- results_list[[i]]
    contrast <- contrasts[i]
    
    # Get significant genes
    sig_genes <- result[result$adj.P.Val < 0.05, ]
    
    # Count up and down regulated
    up_count <- sum(sig_genes$logFC > 0, na.rm = TRUE)
    down_count <- sum(sig_genes$logFC < 0, na.rm = TRUE)
    
    # Add to data frame
    de_counts <- rbind(de_counts, 
                       data.frame(Setup = setup_name,
                                  Contrast = contrast,
                                  Direction = "Up-regulated",
                                  Count = up_count),
                       data.frame(Setup = setup_name,
                                  Contrast = contrast,
                                  Direction = "Down-regulated", 
                                  Count = -down_count))  # Negative for plotting below x-axis
  }
  
  return(de_counts)
}

# Extract DE counts for all setups
all_stringent_counts <- get_de_counts(
  list(results_320_vs_080, results_920_vs_080, results_920_vs_320),
  "All Samples - Stringent"
)

all_lenient_counts <- get_de_counts(
  list(results_320_vs_080_lenient, results_920_vs_080_lenient, results_920_vs_320_lenient),
  "All Samples - Lenient"
)

top8_stringent_counts <- get_de_counts(
  list(results_320_vs_080_top8_stringent, results_920_vs_080_top8_stringent, results_920_vs_320_top8_stringent),
  "Top 8 Samples - Stringent"
)

top8_lenient_counts <- get_de_counts(
  list(results_320_vs_080_top8_lenient, results_920_vs_080_top8_lenient, results_920_vs_320_top8_lenient),
  "Top 8 Samples - Lenient"
)

# Combine all data
all_de_counts <- rbind(all_stringent_counts, all_lenient_counts, top8_stringent_counts, top8_lenient_counts)

# Create individual plots for each setup
library(ggplot2)

# 1. All Samples - Stringent
p1 <- ggplot(all_stringent_counts, aes(x = Contrast, y = Count, fill = Direction)) +
  geom_bar(stat = "identity", position = "identity", width = 0.7) +
  geom_hline(yintercept = 0, color = "black", size = 0.5) +
  scale_fill_manual(values = c("Up-regulated" = "#E31A1C", "Down-regulated" = "#1F78B4")) +
  labs(title = "All Samples - Stringent Filtering",
       x = "Contrast", 
       y = "Number of DE genes",
       fill = "Regulation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
        legend.position = "bottom") +
  geom_text(aes(label = abs(Count)), vjust = ifelse(all_stringent_counts$Count > 0, -0.5, 1.5), 
            size = 4, fontface = "bold") +
  scale_y_continuous(labels = abs, limits = c(min(all_de_counts$Count) * 1.2, max(all_de_counts$Count) * 1.2))
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
# 2. All Samples - Lenient  
p2 <- ggplot(all_lenient_counts, aes(x = Contrast, y = Count, fill = Direction)) +
  geom_bar(stat = "identity", position = "identity", width = 0.7) +
  geom_hline(yintercept = 0, color = "black", size = 0.5) +
  scale_fill_manual(values = c("Up-regulated" = "#E31A1C", "Down-regulated" = "#1F78B4")) +
  labs(title = "All Samples - Lenient Filtering",
       x = "Contrast", 
       y = "Number of DE genes",
       fill = "Regulation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
        legend.position = "bottom") +
  geom_text(aes(label = abs(Count)), vjust = ifelse(all_lenient_counts$Count > 0, -0.5, 1.5), 
            size = 4, fontface = "bold") +
  scale_y_continuous(labels = abs, limits = c(min(all_de_counts$Count) * 1.2, max(all_de_counts$Count) * 1.2))

# 3. Top 8 Samples - Stringent
p3 <- ggplot(top8_stringent_counts, aes(x = Contrast, y = Count, fill = Direction)) +
  geom_bar(stat = "identity", position = "identity", width = 0.7) +
  geom_hline(yintercept = 0, color = "black", size = 0.5) +
  scale_fill_manual(values = c("Up-regulated" = "#E31A1C", "Down-regulated" = "#1F78B4")) +
  labs(title = "Top 8 Samples - Stringent Filtering",
       x = "Contrast", 
       y = "Number of DE genes",
       fill = "Regulation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
        legend.position = "bottom") +
  geom_text(aes(label = abs(Count)), vjust = ifelse(top8_stringent_counts$Count > 0, -0.5, 1.5), 
            size = 4, fontface = "bold") +
  scale_y_continuous(labels = abs, limits = c(min(all_de_counts$Count) * 1.2, max(all_de_counts$Count) * 1.2))

# 4. Top 8 Samples - Lenient
p4 <- ggplot(top8_lenient_counts, aes(x = Contrast, y = Count, fill = Direction)) +
  geom_bar(stat = "identity", position = "identity", width = 0.7) +
  geom_hline(yintercept = 0, color = "black", size = 0.5) +
  scale_fill_manual(values = c("Up-regulated" = "#E31A1C", "Down-regulated" = "#1F78B4")) +
  labs(title = "Top 8 Samples - Lenient Filtering", 
       x = "Contrast", 
       y = "Number of DE genes",
       fill = "Regulation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
        legend.position = "bottom") +
  geom_text(aes(label = abs(Count)), vjust = ifelse(top8_lenient_counts$Count > 0, -0.5, 1.5), 
            size = 4, fontface = "bold") +
  scale_y_continuous(labels = abs, limits = c(min(all_de_counts$Count) * 1.2, max(all_de_counts$Count) * 1.2))

# Display all plots
library(gridExtra)

Attaching package: 'gridExtra'
The following object is masked from 'package:Biobase':

    combine
The following object is masked from 'package:BiocGenerics':

    combine
The following object is masked from 'package:dplyr':

    combine
grid.arrange(p1, p2, p3, p4, ncol = 2, nrow = 2)

# Print summary table
cat("=== DE GENE COUNT SUMMARY ===\n")
=== DE GENE COUNT SUMMARY ===
summary_table <- all_de_counts %>%
  mutate(Count = abs(Count)) %>%
  pivot_wider(names_from = Direction, values_from = Count) %>%
  mutate(Total = `Up-regulated` + `Down-regulated`) %>%
  arrange(Setup, Contrast)

knitr::kable(summary_table, caption = "Summary of DE gene counts across all analysis setups")
Summary of DE gene counts across all analysis setups
Setup Contrast Up-regulated Down-regulated Total
All Samples - Lenient 320v080 4 0 4
All Samples - Lenient 920v080 29 8 37
All Samples - Lenient 920v320 23 6 29
All Samples - Stringent 320v080 3 0 3
All Samples - Stringent 920v080 17 10 27
All Samples - Stringent 920v320 10 5 15
Top 8 Samples - Lenient 320v080 12 1 13
Top 8 Samples - Lenient 920v080 78 38 116
Top 8 Samples - Lenient 920v320 34 15 49
Top 8 Samples - Stringent 320v080 6 1 7
Top 8 Samples - Stringent 920v080 63 37 100
Top 8 Samples - Stringent 920v320 21 14 35

Individual Setup Plots

All Samples - Stringent Filtering

print(p1)

All Samples - Lenient Filtering

print(p2)

Top 8 Samples - Stringent Filtering

print(p3)

Top 8 Samples - Lenient Filtering

print(p4)